# Source code for cas.anndata_to_cas

import json
from typing import Any, Dict, List

from cas.file_utils import read_anndata_file
from cas.utils.conversion_utils import (
    add_labelsets_to_cas,
    add_parent_cell_hierarchy,
    add_parent_hierarchy_to_annotations,
    calculate_labelset,
    generate_parent_cell_lookup,
    get_authors_from_doi,
    create_accession_mapping
)



# [docs]
def anndata2cas(
    anndata_file_path: str,
    labelsets: List[str],
    output_file_path: str,
    include_hierarchy: bool,
    accession_columns: List[str] = None,
):
    """
    Read an AnnData file and write it out as a Cell Annotation Schema (CAS) JSON file.

    Args:
        anndata_file_path (str): Location of the AnnData file to read.
        labelsets (List[str]): Names of the observation (obs) fields that hold the author cell type names.
            They must be listed in rank order, beginning with rank 0 (leaf nodes) and moving up to higher ranks.
        output_file_path (str): Name of the CAS JSON file to write.
        include_hierarchy (bool): When true, parent/child hierarchy information is added to the output.
        accession_columns (List[str], optional): Names of obs columns that carry accession information.
            When given, they populate the 'cell_set_accession' field of the CAS annotations; when omitted,
            accession IDs are generated automatically from a hash of the cells in each cell set.
            Defaults to None.
    """
    adata = read_anndata_file(anndata_file_path)

    # Derive the labelset description and the accession mapping from obs.
    labelset_info = calculate_labelset(adata.obs, labelsets)
    accession_lookup = create_accession_mapping(
        adata.obs, labelsets, accession_columns
    )

    # Start the CAS document from the dataset-level metadata, then register the labelsets.
    cas_document = generate_cas_metadata(dict(adata.uns))
    add_labelsets_to_cas(cas_document, labelset_info)

    # The lookup built here feeds both the annotations and, optionally, the hierarchy.
    cell_lookup = generate_parent_cell_lookup(adata, labelset_info, accession_lookup)
    add_annotations_to_cas(cas_document, labelset_info, cell_lookup)

    if include_hierarchy:
        add_parent_cell_hierarchy(cell_lookup)
        add_parent_hierarchy_to_annotations(cas_document, cell_lookup)

    # Serialise the finished CAS document to disk.
    with open(output_file_path, "w") as out_handle:
        json.dump(cas_document, out_handle, indent=2)




# [docs]
def generate_cas_metadata(uns: Dict[str, Any]) -> Dict[str, Any]:
    """
    Build the top-level CAS metadata dictionary from an 'uns' dictionary.

    Args:
        uns (Dict[str, Any]): The 'uns' metadata dictionary. It must contain 'schema_version'
            (a missing key raises KeyError); 'title' is read if present.

    Returns:
        Dict[str, Any]: The CAS metadata, holding only the fields whose value is not None, plus
        empty 'annotations' and 'labelsets' lists.
    """
    # TODO fields listed with None are placeholders that will be calculated later on
    metadata_fields = [
        ("matrix_file_id", None),
        ("cellannotation_schema_version", uns["schema_version"]),
        ("cellannotation_timestamp", None),
        ("cellannotation_version", None),
        ("cellannotation_url", None),
        ("title", uns.get("title")),
        # A default author_name is supplied because the schema requires one.
        ("author_name", "John Doe"),
        ("author_contact", None),
        ("orcid", None),
        ("annotations", []),
        ("labelsets", []),
    ]
    # Drop every field that has no value yet.
    return {name: value for name, value in metadata_fields if value is not None}




# [docs]
def add_annotations_to_cas(
    cas: Dict[str, Any],
    labelset_dict: Dict[str, Any],
    parent_cell_look_up: Dict[str, Any],
):
    """
    Append one CAS annotation per labelset member to ``cas["annotations"]``.

    For every labelset in ``labelset_dict`` and every label in its ``"members"``, the entry stored
    under the key ``"<labelset>:<label>"`` in ``parent_cell_look_up`` is used to fill in the
    annotation. Fields whose value is None are left out of the annotation.

    Args:
        cas (Dict[str, Any]): The CAS dictionary to update in place. Annotations are appended to its
            'annotations' list; the list is created if the key is missing.
        labelset_dict (Dict[str, Any]): Maps each labelset name to a dictionary that has a 'members'
            iterable of cell labels.
        parent_cell_look_up (Dict[str, Any]): Precomputed lookup keyed by ``"<labelset>:<label>"``. Each
            entry must provide 'accession', 'cell_ontology_term_id', 'cell_ontology_term' and 'cell_ids'.

    Returns:
        None: The `cas` dictionary is modified in place.

    Raises:
        KeyError: If a labelset has no 'members', if a ``"<labelset>:<label>"`` key is absent from
            `parent_cell_look_up`, or if a lookup entry lacks one of the required fields.
    """
    # Create the list when it is missing rather than failing on ``None.append``.
    annotations = cas.setdefault("annotations", [])

    for labelset, labelset_info in labelset_dict.items():
        for label in labelset_info["members"]:
            # Resolve the lookup entry once per label instead of once per field.
            cell_entry = parent_cell_look_up[f"{labelset}:{label}"]

            # Other annotation fields (rationale, rationale_dois, marker_gene_evidence, synonyms,
            # category_*) have no value at this stage and are therefore not emitted here.
            annotation = {
                "labelset": labelset,
                "cell_label": label,
                "cell_fullname": label,
                "cell_set_accession": cell_entry["accession"],
                "cell_ontology_term_id": cell_entry["cell_ontology_term_id"],
                "cell_ontology_term": cell_entry["cell_ontology_term"],
                "cell_ids": list(cell_entry["cell_ids"]),
            }
            # Exclude keys with None values
            annotations.append(
                {
                    field: value
                    for field, value in annotation.items()
                    if value is not None
                }
            )