Source code for cas.abc_cas_converter

import json
import os
from importlib.metadata import version
from typing import Any, Dict

import pandas as pd

from cas.file_utils import read_json_file

# Columns that abc2cas requires in the Cluster Annotation Term Set CSV
# (checked through validate_dataframe_columns before conversion).
CAT_SET_REQUIRED_COLUMNS = ["label", "name", "description", "order"]
# Columns that abc2cas requires in the Cluster Annotation Term CSV
# (checked through validate_dataframe_columns before conversion).
CAT_REQUIRED_COLUMNS = [
    "cluster_annotation_term_set_label",
    "name",
    "label",
    "parent_term_label",
    "cluster_annotation_term_set_name",
]


def validate_dataframe_columns(df: pd.DataFrame, required_columns: list):
    """
    Check that a DataFrame provides every required column.

    Args:
        df (pandas.DataFrame): DataFrame to check.
        required_columns (list): Column names that must be present in ``df``.

    Raises:
        ValueError: If one or more required columns are absent. The message
            lists the absent names in the order they appear in
            ``required_columns``.
    """
    absent = []
    for column_name in required_columns:
        if column_name not in df.columns:
            absent.append(column_name)

    # Nothing missing: the frame is acceptable and there is nothing to return.
    if not absent:
        return

    listing = ", ".join(absent)
    raise ValueError(f"Missing columns in DataFrame: {listing}")
def generate_catset_dataframe(cas: Dict[str, Any]) -> pd.DataFrame:
    """
    Build the Cluster Annotation Term Set (cat_set) table from a CAS dictionary.

    Each entry of ``cas["labelsets"]`` becomes one row. A missing
    ``"labelsets"`` key is treated as an empty list, which yields an empty
    DataFrame.

    Args:
        cas (Dict[str, Any]): The Cell Annotation Schema (CAS) dictionary.

    Returns:
        pd.DataFrame: One row per labelset with the columns ``name``,
        ``description`` and ``order`` (taken from the labelset's ``rank``).
        No ``label`` column is produced.
    """
    # Output column -> key read from each labelset entry.
    column_sources = {"name": "name", "description": "description", "order": "rank"}

    rows = []
    for entry in cas.get("labelsets", []):
        rows.append({column: entry.get(key) for column, key in column_sources.items()})

    return pd.DataFrame(rows)
def generate_cat_dataframe(cas: Dict[str, Any]) -> pd.DataFrame:
    """
    Build the Cluster Annotation Term (cat) table from a CAS dictionary.

    Each entry of ``cas["annotations"]`` becomes one row. A missing (or
    ``None``) ``"annotations"`` value is treated as an empty list, so the
    result is then an empty DataFrame that still carries all columns.

    Args:
        cas (Dict[str, Any]): The Cell Annotation Schema (CAS) dictionary.

    Returns:
        pd.DataFrame: DataFrame with the columns listed in ``columns`` below.
        Only ``label`` (from ``cell_set_accession``), ``name`` (from
        ``cell_label``) and ``parent_term_label`` (from
        ``parent_cell_set_accession``) are populated; the remaining columns
        are left empty.
    """
    columns = [
        "label",
        "name",
        "cluster_annotation_term_set_label",
        "parent_term_label",
        "parent_term_set_label",
        "term_set_order",
        "term_order",
        "cluster_annotation_term_set_name",
    ]

    # Fall back to an empty list so a CAS without annotations does not crash
    # the loop (mirrors the "labelsets" handling in generate_catset_dataframe).
    annotations = cas.get("annotations") or []

    data = [
        {
            "label": annotation.get("cell_set_accession"),
            "name": annotation.get("cell_label"),
            "parent_term_label": annotation.get("parent_cell_set_accession"),
        }
        for annotation in annotations
    ]

    # Passing `columns` fixes the column order and adds the unpopulated ones.
    return pd.DataFrame(data, columns=columns)
def calculate_order_mapping(order_values: pd.Series) -> Dict[str, str]:
    """
    Map each non-zero order value to its position in descending order.

    Zero entries are dropped, the remaining values are sorted from largest to
    smallest, and each value is paired with its 0-based position in that
    sorted sequence. If a value occurs more than once, the position of its
    last occurrence is kept.

    Note: although the return annotation reads ``Dict[str, str]``, the keys are
    the order values themselves and the values are integer positions.

    Args:
        order_values (pandas.Series): Series containing the order values.

    Returns:
        Dict[str, str]: Mapping from order value to its rank position.
    """
    nonzero_orders = order_values[order_values != 0]
    descending_orders = nonzero_orders.sort_values(ascending=False)
    return {value: position for position, value in enumerate(descending_orders)}
def abc2cas(cat_set_file_path: str, cat_file_path: str, output_file_path: str):
    """
    Convert ABC term-set and term CSV files into a CAS JSON file.

    Both CSV files are read, checked for their required columns, turned into
    CAS labelsets and annotations, and the resulting dictionary is written to
    ``output_file_path`` as JSON indented by two spaces.

    Args:
        cat_set_file_path: Path to the Cluster Annotation Term Set file.
        cat_file_path: Path to the Cluster Annotation Term file.
        output_file_path: Path of the CAS JSON file to write.

    Raises:
        ValueError: If either CSV file lacks one of its required columns.
    """
    term_sets = pd.read_csv(cat_set_file_path, sep=",")
    terms = pd.read_csv(cat_file_path, sep=",")

    # Validate the term-set table first, then the term table.
    for frame, required in (
        (term_sets, CAT_SET_REQUIRED_COLUMNS),
        (terms, CAT_REQUIRED_COLUMNS),
    ):
        validate_dataframe_columns(frame, required)

    schema = init_metadata()
    add_labelsets(schema, term_sets)
    add_annotations(schema, terms)

    with open(output_file_path, "w") as out_handle:
        json.dump(schema, out_handle, indent=2)
def add_annotations(cas: Dict[str, Any], cat: pd.DataFrame):
    """
    Append one CAS annotation per row of the Cluster Annotation Term table.

    The annotations are appended to ``cas["annotations"]``; the list is
    created if the key is not present yet. Only four fields are taken from
    the table. Any of them whose value is ``None`` or NaN is left out of the
    annotation. Other CAS annotation fields have no source column here and
    are therefore never emitted.

    Args:
        cas (Dict[str, Any]): Dictionary representing the Cell Annotation
            Schema; modified in place.
        cat (pd.DataFrame): Cluster Annotation Term data. Must provide the
            columns ``cluster_annotation_term_set_name``, ``name``, ``label``
            and ``parent_term_label``.
    """
    # setdefault (rather than get) so a CAS dict without the key gets a list
    # instead of failing with an AttributeError on None.
    annotations = cas.setdefault("annotations", [])

    for row in cat.itertuples():
        anno = {
            "labelset": row.cluster_annotation_term_set_name,
            "cell_label": row.name,
            "cell_set_accession": row.label,
            "parent_cell_set_accession": row.parent_term_label,
        }
        # Empty CSV cells arrive as NaN; drop them along with explicit Nones.
        annotations.append(
            {k: v for k, v in anno.items() if v is not None and not pd.isna(v)}
        )
def add_labelsets(cas: Dict[str, Any], cat_set: pd.DataFrame):
    """
    Append one CAS labelset per row of the Cluster Annotation Term Set table.

    The labelsets are appended to ``cas["labelsets"]``; the list is created
    if the key is not present yet. Every labelset gets ``name`` and
    ``description``. Rows whose ``order`` is non-zero additionally get a
    ``rank`` looked up from ``calculate_order_mapping``; rows with ``order``
    equal to 0 get no ``rank``.

    Args:
        cas (Dict[str, Any]): Cell Annotation Schema dictionary; modified in
            place.
        cat_set (pandas.DataFrame): Cluster Annotation Term Set data. Must
            provide the columns ``name``, ``description`` and ``order``.
    """
    order_mapping = calculate_order_mapping(cat_set["order"])

    # setdefault (rather than get) so a CAS dict without the key gets a list
    # instead of failing with an AttributeError on None.
    labelsets = cas.setdefault("labelsets", [])

    for row in cat_set.itertuples():
        labelset = {"name": row.name, "description": row.description}
        # An order of 0 is excluded from the mapping, so such rows stay unranked.
        if row.order != 0:
            labelset["rank"] = order_mapping[row.order]
        labelsets.append(labelset)
def init_metadata() -> Dict[str, Any]:
    """
    Create the skeleton of a Cell Annotation Schema (CAS) dictionary.

    Returns:
        Dict[str, Any]: Dictionary holding the installed
        ``cell-annotation-schema`` package version, a placeholder author name,
        and empty ``annotations`` and ``labelsets`` lists. Fields that have no
        value yet (``None``) are left out.
    """
    # TODO These needs proper assignments
    defaults = {
        "matrix_file_id": None,
        "cellannotation_schema_version": version("cell-annotation-schema"),
        "cellannotation_timestamp": None,
        "cellannotation_version": None,
        "cellannotation_url": None,
        "author_name": "Jane Doe",  # TODO Needs proper author_name assignment
        "author_contact": None,
        "orcid": None,
        "annotations": [],
        "labelsets": [],
    }

    # Keep only the fields that actually have a value.
    populated = {}
    for field_name, field_value in defaults.items():
        if field_value is not None:
            populated[field_name] = field_value
    return populated
def cas2abc(cas_file_path: str, cat_set_file_path: str, cat_file_path: str):
    """
    Convert a Cell Annotation Schema (CAS) file into the two ABC CSV files.

    The CAS JSON is read, the cluster_annotation_term_set and
    cluster_annotation_term tables are generated from it, and both are written
    as CSV without the index column. Output paths are joined onto the current
    working directory.

    Args:
        cas_file_path: Path to the Cell Annotation Schema (CAS) file.
        cat_set_file_path: Path of the Cluster Annotation Term Set file to write.
        cat_file_path: Path of the Cluster Annotation Term file to write.
    """
    schema = read_json_file(cas_file_path)
    term_set_frame = generate_catset_dataframe(schema)
    term_frame = generate_cat_dataframe(schema)

    base_directory = os.getcwd()
    # Term-set table first, then the term table.
    for frame, target_path in (
        (term_set_frame, cat_set_file_path),
        (term_frame, cat_file_path),
    ):
        frame.to_csv(os.path.join(base_directory, target_path), index=False)
    # TODO implement rest of the method once the requirements are more clear