import json
import os
from importlib.metadata import version
from typing import Any, Dict
import pandas as pd
from cas.file_utils import read_json_file
# Columns a Cluster Annotation Term Set table must provide before conversion.
CAT_SET_REQUIRED_COLUMNS = [
    "label",
    "name",
    "description",
    "order",
]

# Columns a Cluster Annotation Term table must provide before conversion.
CAT_REQUIRED_COLUMNS = [
    "cluster_annotation_term_set_label",
    "name",
    "label",
    "parent_term_label",
    "cluster_annotation_term_set_name",
]
def validate_dataframe_columns(df: pd.DataFrame, required_columns: list):
    """
    Check that a DataFrame contains every required column.

    Args:
        df (pandas.DataFrame): DataFrame to inspect.
        required_columns (list): Column names that must all be present.

    Raises:
        ValueError: If one or more required columns are absent. The message
            lists the absent names in the order given by ``required_columns``.
    """
    available = df.columns
    missing_columns = [name for name in required_columns if name not in available]
    if not missing_columns:
        return
    raise ValueError(f"Missing columns in DataFrame: {', '.join(missing_columns)}")
def generate_catset_dataframe(cas: Dict[str, Any]) -> pd.DataFrame:
    """
    Build the Cluster Annotation Term Set (cat_set) table from a CAS dictionary.

    Each entry of ``cas["labelsets"]`` becomes one row: its ``name`` and
    ``description`` are copied over and its ``rank`` is written to the
    ``order`` column. Keys absent from a labelset produce missing values.

    Args:
        cas (Dict[str, Any]): The Cell Annotation Schema (CAS) dictionary.

    Returns:
        pd.DataFrame: One row per labelset with the columns ``name``,
        ``description`` and ``order``. The columns are present even when the
        CAS contains no labelsets.
    """
    columns = ["name", "description", "order"]
    # ``or []`` covers both a missing key and an explicit ``None`` value.
    labelsets = cas.get("labelsets") or []
    rows = [
        {
            "name": labelset.get("name"),
            "description": labelset.get("description"),
            "order": labelset.get("rank"),
        }
        for labelset in labelsets
    ]
    # Pinning the columns keeps the header stable for empty input, matching
    # how the cat table is built.
    return pd.DataFrame(rows, columns=columns)
def generate_cat_dataframe(cas: Dict[str, Any]) -> pd.DataFrame:
    """
    Build the Cluster Annotation Term (cat) table from a CAS dictionary.

    Each entry of ``cas["annotations"]`` becomes one row. Only three columns
    are populated: ``label`` (from ``cell_set_accession``), ``name`` (from
    ``cell_label``) and ``parent_term_label`` (from
    ``parent_cell_set_accession``). The remaining columns are created but
    left empty.

    Args:
        cas (Dict[str, Any]): The Cell Annotation Schema (CAS) dictionary.

    Returns:
        pd.DataFrame: One row per annotation. The full column set is present
        even when the CAS contains no annotations.
    """
    columns = [
        "label",
        "name",
        "cluster_annotation_term_set_label",
        "parent_term_label",
        "parent_term_set_label",
        "term_set_order",
        "term_order",
        "cluster_annotation_term_set_name",
    ]
    # A CAS without an "annotations" key (or with a null value) yields an
    # empty table instead of failing on iteration over None.
    annotations = cas.get("annotations") or []
    rows = [
        {
            "label": annotation.get("cell_set_accession"),
            "name": annotation.get("cell_label"),
            "parent_term_label": annotation.get("parent_cell_set_accession"),
        }
        for annotation in annotations
    ]
    return pd.DataFrame(rows, columns=columns)
def calculate_order_mapping(order_values: pd.Series) -> Dict[Any, int]:
    """
    Map each non-zero order value to a zero-based rank.

    Order values equal to 0 are excluded. The remaining distinct values are
    ranked in descending order, so the largest order value gets rank 0, the
    next largest rank 1, and so on.

    Args:
        order_values (pandas.Series): Series containing the order values.

    Returns:
        Dict[Any, int]: Mapping from order value to its integer rank.
    """
    non_zero = order_values[order_values != 0]
    # De-duplicate before ranking: repeated order values would otherwise
    # consume several consecutive indices and leave gaps in the ranks.
    distinct_descending = non_zero.drop_duplicates().sort_values(ascending=False)
    return {order: rank for rank, order in enumerate(distinct_descending)}
def abc2cas(cat_set_file_path: str, cat_file_path: str, output_file_path: str):
    """
    Convert a pair of ABC tables into a Cell Annotation Schema (CAS) JSON file.

    Both inputs are read as comma-separated files and checked for their
    required columns. The term set table supplies the labelsets and the term
    table supplies the annotations of the resulting CAS.

    Args:
        cat_set_file_path: Path to the Cluster Annotation Term Set file.
        cat_file_path: Path to the Cluster Annotation Term file.
        output_file_path: Path of the CAS JSON file to write.

    Raises:
        ValueError: If either table lacks one of its required columns.
    """
    term_set_table = pd.read_csv(cat_set_file_path, sep=",")
    term_table = pd.read_csv(cat_file_path, sep=",")

    # Both tables are validated before any CAS content is assembled.
    checks = (
        (term_set_table, CAT_SET_REQUIRED_COLUMNS),
        (term_table, CAT_REQUIRED_COLUMNS),
    )
    for table, required in checks:
        validate_dataframe_columns(table, required)

    cas = init_metadata()
    add_labelsets(cas, term_set_table)
    add_annotations(cas, term_table)

    # Serialize the assembled CAS with two-space indentation.
    with open(output_file_path, "w") as json_file:
        json.dump(cas, json_file, indent=2)
def add_annotations(cas: Dict[str, Any], cat: pd.DataFrame):
    """
    Append one CAS annotation per row of the Cluster Annotation Term table.

    Column mapping (table column -> CAS field):
        cluster_annotation_term_set_name -> labelset
        name                             -> cell_label
        label                            -> cell_set_accession
        parent_term_label                -> parent_cell_set_accession

    A field is left out of an annotation when its value is None or NaN. No
    other CAS annotation fields are emitted, since nothing in the table is
    mapped to them.

    Args:
        cas (Dict[str, Any]): Dictionary representing the Cell Annotation
            Schema. Modified in place; its ``"annotations"`` list is created
            when missing or None.
        cat (pd.DataFrame): DataFrame containing Cluster Annotation Term data.
    """
    # Resolve the target list once, creating it if the CAS has none yet.
    annotations = cas.get("annotations")
    if annotations is None:
        annotations = cas["annotations"] = []

    for row in cat.itertuples():
        candidate = {
            "labelset": row.cluster_annotation_term_set_name,
            "cell_label": row.name,
            "cell_set_accession": row.label,
            "parent_cell_set_accession": row.parent_term_label,
        }
        # Skip empty cells so the annotation carries only populated fields.
        annotations.append(
            {k: v for k, v in candidate.items() if v is not None and not pd.isna(v)}
        )
def add_labelsets(cas: Dict[str, Any], cat_set: pd.DataFrame):
    """
    Append one CAS labelset per row of the Cluster Annotation Term Set table.

    Every labelset receives the row's ``name`` and ``description``. Rows whose
    ``order`` is not 0 additionally receive a ``rank`` looked up from the
    order-to-rank mapping computed over the whole ``order`` column; rows with
    order 0 get no rank.

    Args:
        cas (Dict[str, Any]): Cell Annotation Schema dictionary. Modified in
            place; its ``"labelsets"`` list is created when missing or None.
        cat_set (pandas.DataFrame): DataFrame containing Cluster Annotation
            Term Set data.
    """
    # Computed first so a missing "order" column fails before ``cas`` changes.
    order_mapping = calculate_order_mapping(cat_set["order"])

    # Resolve the target list once, creating it if the CAS has none yet.
    labelsets = cas.get("labelsets")
    if labelsets is None:
        labelsets = cas["labelsets"] = []

    for row in cat_set.itertuples():
        labelset = {"name": row.name, "description": row.description}
        if row.order != 0:
            labelset["rank"] = order_mapping[row.order]
        labelsets.append(labelset)
def cas2abc(cas_file_path: str, cat_set_file_path: str, cat_file_path: str):
    """
    Convert a Cell Annotation Schema (CAS) file into the two ABC tables.

    The CAS JSON is loaded and turned into a cluster_annotation_term_set
    table and a cluster_annotation_term table. Each table is written as CSV,
    without the index column, to its output path joined onto the current
    working directory.

    Args:
        cas_file_path: Path to the Cell Annotation Schema (CAS) file
        cat_set_file_path: Path to the Cluster Annotation Term Set file.
        cat_file_path: Path to the Cluster Annotation Term file.
    """
    cas_json = read_json_file(cas_file_path)
    term_set_table = generate_catset_dataframe(cas_json)
    term_table = generate_cat_dataframe(cas_json)

    base_directory = os.getcwd()
    outputs = (
        (term_set_table, cat_set_file_path),
        (term_table, cat_file_path),
    )
    for table, target_path in outputs:
        table.to_csv(os.path.join(base_directory, target_path), index=False)
    # TODO implement rest of the method once the requirements are more clear