Source code for cas.spreadsheet_to_cas

import json
import logging
import re
from collections import OrderedDict
from typing import List, Optional

import pandas as pd

from cas.anndata_to_cas import calculate_labelset
from cas.cxg_utils import download_dataset_with_id
from cas.file_utils import read_anndata_file
from cas.utils.conversion_utils import (
    add_labelsets_to_cas,
    add_parent_cell_hierarchy,
    add_parent_hierarchy_to_annotations,
    generate_parent_cell_lookup,
    retrieve_schema,
)

logging.basicConfig(level=logging.INFO)

LABELSET_COLUMN = "CELL LABELSET NAME"
CELL_LABEL_COLUMN = "CELL TYPE TERM"
CL_TERM_COLUMN = "CL TERM"
EVIDENCE_COLUMN = "EVIDENCE"
MARKER_GENES_COLUMN = "MARKER GENES"
SYNONYM_COLUMN = "SYNONYMS"
CATEGORIES_COLUMN = "CATEGORIES"



[docs]
def resolve_ref(schema, ref):
    parts = ref[1:].split("/")
    definition = schema
    for part in parts[1:]:
        definition = definition[part]
    return definition




[docs]
def read_spreadsheet(file_path: str, sheet_name: Optional[str], schema: dict):
    """
    Read the specific sheet from the Excel file into a pandas DataFrame.

    Args:
        file_path (str): Path to the Excel file.
        sheet_name (str, optional): Target sheet name. If not provided, reads the first sheet.
        schema: Cell annotation schema

    Returns:
        tuple: Tuple containing metadata (dict), column names (list), and raw data (pd.DataFrame).
    """
    if sheet_name:
        spreadsheet_df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    else:
        spreadsheet_df = pd.read_excel(file_path, header=None)

    meta_data = {}

    header_row_index = None
    metadata_properties = {
        k: v
        for k, v in schema["properties"].items()
        if k not in ["labelsets", "annotations"]
    }
    for index, row in spreadsheet_df.iterrows():
        first_cell = str(row[0])
        if first_cell.startswith("#"):
            key = first_cell[1:].strip()
            value = row[1] if pd.notnull(row[1]) else ""
            if key in metadata_properties:
                meta_data[key] = value
                if metadata_properties[key]["type"] == "array":
                    meta_data[key] = re.split("[,|]", value)
        else:
            header_row_index = index
            break

    if header_row_index is not None:
        column_names = spreadsheet_df.iloc[header_row_index, :].tolist()
        raw_data = spreadsheet_df.iloc[header_row_index + 1 :, :]
        raw_data.columns = column_names
        # Iterate through each column and filter out rows with 'cell_type' in any column
        for column in raw_data.columns:
            raw_data = raw_data[raw_data[column] != "cell_type"]
        raw_data = raw_data.where(pd.notnull(raw_data), "")
    else:
        raise ValueError("Header row not found in the spreadsheet.")

    return meta_data, column_names, raw_data




[docs]
def custom_lowercase_transform(s):
    """
    Transforms the given string to lowercase except for words that are acronyms or specific cell type names
    which are three characters or fewer.

    Args:
        s (str): The input string.

    Returns:
        str: The transformed string.
    """

    # Define a function to decide whether a word should stay uppercase
    def transform_word(match):
        word = match.group()
        # If a word is three characters or fewer, it stays uppercase
        if len(word) <= 3:
            return word
        # Otherwise, convert the word to lowercase
        else:
            return word.lower()

    # Use regex to find words and apply the transformation logic
    transformed_string = re.sub(r"\b[A-Za-z]+\b", transform_word, s)

    return transformed_string




[docs]
def spreadsheet2cas(
    spreadsheet_file_path: str,
    sheet_name: Optional[str],
    anndata_file_path: Optional[str],
    labelset_list: Optional[List[str]],
    schema_name: Optional[str],
    output_file_path: str,
):
    """
    Convert a spreadsheet to Cell Annotation Schema (CAS) JSON.

    Args:
        spreadsheet_file_path (str): Path to the spreadsheet file.
        sheet_name (Optional[str]): Target sheet name in the spreadsheet. Can be a string or None.
        anndata_file_path: The path to the AnnData file.
        labelset_list (Optional[List[str]]): List of names of observation (obs) fields used to record author cell
        type names, which determine the rank of labelsets in a spreadsheet.
        schema_name (Optional[str]): Name of the CAS schema, can be one of 'base', 'bican' or 'cap'.
        output_file_path (str): Output CAS file name.
    """
    cell_annotation_schema = retrieve_schema(schema_name if schema_name else "cap")
    meta_data_result, column_names_result, raw_data_result = read_spreadsheet(
        spreadsheet_file_path, sheet_name, cell_annotation_schema
    )

    anndata, matrix_file_id = load_or_fetch_anndata(anndata_file_path, meta_data_result)

    if not labelset_list:
        labelset_list = raw_data_result["labelset"].unique().tolist()

    labelset_dict = calculate_labelset(anndata.obs, labelset_list)

    cas = initialize_cas_structure(matrix_file_id, meta_data_result)

    add_labelsets_to_cas(cas, labelset_dict)

    parent_cell_look_up = generate_parent_cell_lookup(anndata, labelset_dict)

    add_annotations_to_cas(
        cas,
        raw_data_result,
        column_names_result,
        cell_annotation_schema,
        parent_cell_look_up,
    )

    add_parent_cell_hierarchy(parent_cell_look_up)
    add_parent_hierarchy_to_annotations(cas, parent_cell_look_up)

    # Write the JSON data to the file
    with open(output_file_path, "w") as json_file:
        json.dump(cas, json_file, indent=2)




[docs]
def add_annotations_to_cas(cas, raw_data_result, columns, schema, parent_cell_look_up):
    """
    Adds processed annotations from raw data to the CAS structure and tracks labelsets.
    Assumes certain external definitions for column names and transformation functions.

    Args:
        cas (dict): The CAS structure to update with annotations.
        raw_data_result (DataFrame): Raw annotation data.
        columns (list): Column names of raw data to process.
        schema (dict): Cell annotation schema.
        parent_cell_look_up (Dict[str, Any]): A precomputed dictionary containing hierarchical metadata about cell
            labels.

    Returns:
        OrderedDict: Tracks labelsets encountered, initialized to None.

    Note:
        Requires `custom_lowercase_transform`, `get_cell_ids`, and column constants to be defined.
    """
    stripped_data_result = raw_data_result.map(
        lambda x: x.strip() if isinstance(x, str) else x
    )
    for index, row in stripped_data_result.iterrows():
        anno = {}
        user_annotations = {}
        label = row["cell_label"]
        annotation_properties = schema["definitions"]["Annotation"]["properties"]

        for column_name in columns:
            if column_name in annotation_properties:
                anno[column_name] = row[column_name]
                if column_name == "labelset":
                    labelset = anno[column_name]
                if annotation_properties[column_name]["type"] == "array":
                    anno[column_name] = re.split("[,|]", row[column_name])
            elif row[column_name]:
                user_annotations[column_name] = row[column_name]

        anno.update(
            {
                "cell_ids": list(
                    parent_cell_look_up[f"{labelset}:{label}"]["cell_ids"]
                ),
                "cell_set_accession": parent_cell_look_up[f"{labelset}:{label}"][
                    "accession"
                ],
                "cell_ontology_term_id": parent_cell_look_up[f"{labelset}:{label}"][
                    "cell_ontology_term_id"
                ],
                "cell_ontology_term": parent_cell_look_up[f"{labelset}:{label}"][
                    "cell_ontology_term"
                ],
            }
        )
        if user_annotations:
            anno["author_annotation_fields"] = user_annotations
        cas.get("annotations").append(anno)




[docs]
def initialize_cas_structure(matrix_file_id: str, meta_data_result: dict):
    """
    Initializes the Cell Annotation Schema (CAS) structure with basic information and placeholders
    for annotations and labelsets. Fields initialized with None values are omitted in the final output.

    Args:
        matrix_file_id (str): The ID of the matrix file, used within the CAS for identification.
        meta_data_result (dict): Metadata containing at least the 'matrix_file_id' for the CAS URL.

    Returns:
        dict: The initial CAS structure with the matrix file ID, annotation URL, and placeholders
              for future data. Excludes fields that remain None.
    """
    cas_init = {k: v for k, v in meta_data_result.items()}
    cas_init.update(
        {
            "matrix_file_id": f"cxg_dataset:{matrix_file_id}",
            "cellannotation_url": meta_data_result["matrix_file_id"],
            "annotations": [],
            "labelsets": [],
        }
    )
    cas = {k: v for k, v in cas_init.items() if v is not None}
    return cas




[docs]
def load_or_fetch_anndata(anndata_file_path: str, meta_data_result: dict):
    """
    Loads or fetches an AnnData file, based on a local path or a matrix file ID from metadata.

    Args:
        anndata_file_path (str): Path to an AnnData file, or None to fetch using metadata.
        meta_data_result (dict): Metadata with 'matrix_file_id' for fetching the dataset.

    Returns:
        tuple: (AnnData object, matrix file ID), ready for use.

    Raises:
        ValueError: If 'matrix_file_id' is missing from metadata.
    """
    matrix_file_id = (
        meta_data_result["matrix_file_id"].rstrip("/").split("/")[-1].split(".")[0]
    )
    if not anndata_file_path:
        anndata_file_path = download_dataset_with_id(matrix_file_id)
    dataset_anndata = read_anndata_file(anndata_file_path)
    return dataset_anndata, matrix_file_id