import json
from typing import Any, Dict, List
from cas.file_utils import read_anndata_file
from cas.utils.conversion_utils import (
add_labelsets_to_cas,
add_parent_cell_hierarchy,
add_parent_hierarchy_to_annotations,
calculate_labelset,
generate_parent_cell_lookup,
get_authors_from_doi,
create_accession_mapping
)
[docs]
def anndata2cas(
    anndata_file_path: str,
    labelsets: List[str],
    output_file_path: str,
    include_hierarchy: bool,
    accession_columns: List[str] = None,
):
    """
    Read an AnnData file and write its cell annotations out as CAS JSON.

    Args:
        anndata_file_path (str): Location of the AnnData file to read.
        labelsets (List[str]): Names of the observation (obs) fields that hold author cell type names,
            ordered from rank 0 (leaf nodes) upwards to higher ranks.
        output_file_path (str): Path of the CAS JSON file to write.
        include_hierarchy (bool): When true, parent/child hierarchy information is added to the
            annotations before the file is written.
        accession_columns (List[str], optional): obs columns carrying accession information used for the
            'cell_set_accession' field. When omitted, accession IDs are generated automatically from a
            hash of the cells in each cell set. Defaults to None.
    """
    adata = read_anndata_file(anndata_file_path)
    obs = adata.obs

    # Derive the per-labelset structures from the observation table.
    labelset_info = calculate_labelset(obs, labelsets)
    accession_lookup = create_accession_mapping(obs, labelsets, accession_columns)

    # Start the CAS document from the unstructured metadata, then fill it in.
    cas_document = generate_cas_metadata(dict(adata.uns))
    add_labelsets_to_cas(cas_document, labelset_info)

    cell_lookup = generate_parent_cell_lookup(adata, labelset_info, accession_lookup)
    add_annotations_to_cas(cas_document, labelset_info, cell_lookup)

    if include_hierarchy:
        add_parent_cell_hierarchy(cell_lookup)
        add_parent_hierarchy_to_annotations(cas_document, cell_lookup)

    # Serialise the finished CAS document as indented JSON.
    with open(output_file_path, "w") as out_handle:
        json.dump(cas_document, out_handle, indent=2)
[docs]
def add_annotations_to_cas(
    cas: Dict[str, Any],
    labelset_dict: Dict[str, Any],
    parent_cell_look_up: Dict[str, Any],
):
    """
    Append one CAS annotation per (labelset, label) pair to ``cas["annotations"]``.

    For every label listed under ``labelset_dict[<labelset>]["members"]`` the entry stored in
    ``parent_cell_look_up`` under the key ``"<labelset>:<label>"`` is read and turned into an
    annotation dictionary. Fields whose value is None are left out of the annotation.

    Args:
        cas (Dict[str, Any]): The CAS dictionary to update. New annotations are appended to its
            'annotations' list; if that key is missing or None, an empty list is created first.
        labelset_dict (Dict[str, Any]): Maps each labelset name to a dictionary that has a 'members'
            iterable of cell labels.
        parent_cell_look_up (Dict[str, Any]): Precomputed lookup keyed by ``"<labelset>:<label>"``.
            Each entry must provide 'accession', 'cell_ontology_term_id', 'cell_ontology_term' and
            'cell_ids'.

    Returns:
        None: ``cas`` is modified in place.

    Raises:
        KeyError: If a labelset has no 'members' key, a ``"<labelset>:<label>"`` key is missing from
            ``parent_cell_look_up``, or a lookup entry lacks one of the required fields.
    """
    # Resolve the target list once; tolerate a CAS dict that has no (or a null) 'annotations' entry
    # instead of failing with an AttributeError on None.
    annotations = cas.get("annotations")
    if annotations is None:
        annotations = cas["annotations"] = []

    for labelset, labelset_info in labelset_dict.items():
        for label in labelset_info["members"]:
            cell_info = parent_cell_look_up[f"{labelset}:{label}"]
            # Optional CAS fields (rationale, rationale_dois, marker_gene_evidence, synonyms and the
            # category_* fields) are not populated by this conversion, so they are simply not emitted.
            annotation = {
                "labelset": labelset,
                "cell_label": label,
                "cell_fullname": label,
                "cell_set_accession": cell_info["accession"],
                "cell_ontology_term_id": cell_info["cell_ontology_term_id"],
                "cell_ontology_term": cell_info["cell_ontology_term"],
                "cell_ids": list(cell_info["cell_ids"]),
            }
            # Exclude keys with None values (e.g. labels without an ontology mapping).
            annotations.append(
                {field: value for field, value in annotation.items() if value is not None}
            )