# Source code for blimp.preprocessing.operetta_parse_metadata

"""Copyright 2023 (C) University of New South Wales.

Original author: Scott Berry <scott.berry@unsw.edu.au>
"""
from typing import Union
from pathlib import Path
import os
import logging
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Column name -> dtype mapping that get_image_metadata() passes to
# DataFrame.astype() to convert the text values parsed from the XML into
# typed columns. All keys except "StandardFieldID" are expected to come from
# the XML itself; "StandardFieldID" is computed by get_image_metadata()
# before the conversion is applied.
# NOTE(review): DataFrame.astype() with a mapping presumably requires every
# key to be an existing column, so a file lacking one of these fields would
# fail the conversion - confirm against real Index.idx.xml files.
image_metadata_dtypes = {
    "id": str,
    "State": str,
    "URL": str,
    "Row": np.uint8,
    "Col": np.uint8,
    "FieldID": np.uint16,
    "PlaneID": np.uint16,
    "TimepointID": np.int64,
    "ChannelID": np.uint8,
    "FlimID": np.uint8,
    "ChannelName": str,
    "ImageType": str,
    "AcquisitionType": str,
    "IlluminationType": str,
    "ChannelType": str,
    "ImageResolutionX": float,
    "ImageResolutionY": float,
    "ImageSizeX": np.uint16,
    "ImageSizeY": np.uint16,
    "BinningX": np.uint8,
    "BinningY": np.uint8,
    "MaxIntensity": int,
    "CameraType": str,
    "PositionX": float,
    "PositionY": float,
    "PositionZ": float,
    "AbsPositionZ": float,
    "MeasurementTimeOffset": float,
    "AbsTime": "datetime64[ns]",
    "MainExcitationWavelength": np.uint16,
    "MainEmissionWavelength": np.uint16,
    "ObjectiveMagnification": np.uint8,
    "ObjectiveNA": float,
    "ExposureTime": float,
    "OrientationMatrix": str,
    "StandardFieldID": np.uint16,
}


def _remove_ns(s: str) -> str:
    """Strip a leading ``{namespace}`` qualifier from an XML tag.

    Parameters
    ----------
    s : string
        string to be modified

    Returns
    -------
    str
        string with text up to and including the first "}" removed;
        ``s`` is returned unchanged when it contains no "}"

    Examples
    --------
    >>> _remove_ns("{http://www.perkinelmer.com/PEHH/HarmonyV5}Plates")
    'Plates'
    >>> _remove_ns("Plates")
    'Plates'
    """
    # partition() never raises: an empty separator slot means there was no
    # "}" in the input, i.e. the tag carries no namespace qualifier.
    _, closing_brace, local_name = s.partition("}")
    return local_name if closing_brace else s


def _xml_to_df(xmls, ns_key: str, ns_dict: dict) -> pd.DataFrame:
    """Convert a list of xmls into a pandas dataframe, where each xml forms a
    row.

    The direct children of each xml element become the columns: the column
    name is the child's tag without its namespace qualifier and the value is
    the child's text. Children that do not belong to the requested namespace
    are skipped. If a tag is repeated within one element, the first
    occurrence is used.

    Parameters
    ----------
    xmls
        iterable of XML elements to parse
    ns_key
        namespace key to use (must be a key of ``ns_dict``)
    ns_dict
        namespace dictionary mapping keys to namespace URIs

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = length(xmls), n_cols = # fields)

    Raises
    ------
    KeyError
        if ``ns_key`` is not present in ``ns_dict``

    Examples
    --------
    >>> idx_path = image_dir / 'Images' / 'Index.idx.xml'
    >>> idx_xml = ET.parse(idx_path).getroot()
    >>> ns = {'harmony': "http://www.perkinelmer.com/PEHH/HarmonyV5"}
    >>> plates_xml = idx_xml.find('harmony:Plates',namespaces=ns).findall('harmony:Plate',namespaces=ns)
    >>> _xml_to_df(plates_xml,"harmony",ns)
    """
    # Namespace-qualified tags have the form "{namespace-uri}LocalName".
    ns_prefix = "{" + ns_dict[ns_key] + "}"
    prefix_length = len(ns_prefix)

    metadata = []
    for xml in xmls:
        record = {}
        # Read each child once instead of searching the parent again for
        # every tag (which was quadratic in the number of fields).
        for field in xml:
            tag = field.tag
            if isinstance(tag, str) and tag.startswith(ns_prefix):
                # setdefault keeps the first of any repeated tags, which is
                # the element a find() on the parent would have returned.
                record.setdefault(tag[prefix_length:], field.text)
        metadata.append(record)
    # convert to dataframe
    return pd.DataFrame(metadata)


def _to_well_name(row: int, column: int) -> str:
    """Build a well name from a row number and a column number.

    The row number is turned into an upper-case letter (1 gives "A") and the
    column number is zero-padded to at least two digits.

    Parameters
    ----------
    row : int
        Row of the plate
    column : int
        Column of the plate

    Returns
    -------
    str
        Well name

    Examples
    --------
    >>> _to_well_name(1, 13)
    'A13'
    """
    # chr(97) is "a", so an offset of 96 maps row 1 onto the first letter
    row_letter = chr(96 + row).upper()
    column_digits = "%0.2d" % column
    return row_letter + column_digits



# [docs]
def get_plate_metadata(metadata_file: Union[str, Path], out_file: Union[str, Path, None] = None) -> pd.DataFrame:
    """Extracts plate metadata from the operetta xml file.

    Parameters
    ----------
    metadata_file
        path to the xml metadata file
    out_file
        enter a file path if this dataframe should be written to file
        (possible extensions are .csv or .pkl; any other extension is
        reported with a warning and nothing is written)

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # Plate elements, n_cols = # xml fields)

    Raises
    ------
    ValueError
        if the XML root has no ``Plates`` element in the Harmony namespace
    """
    # define xml namespace
    ns = {"harmony": "http://www.perkinelmer.com/PEHH/HarmonyV5"}

    # get xml
    metadata_xml = ET.parse(metadata_file).getroot()
    plates_node = metadata_xml.find("harmony:Plates", namespaces=ns)
    if plates_node is None:
        # Raise instead of calling os._exit(): a hard exit skips handler
        # flushing and cleanup, and cannot be handled by the caller.
        message = "Operetta plate metadata XML not parsed correctly"
        logger.error(message)
        raise ValueError(message)
    plates_xml = plates_node.findall("harmony:Plate", namespaces=ns)

    # convert to dataframe
    plate_metadata = _xml_to_df(plates_xml, "harmony", ns)

    # write file if requested
    if out_file is not None:
        suffix = Path(out_file).suffix
        if suffix == ".csv":
            logger.info("Save plate metadata to file: %s", out_file)
            plate_metadata.to_csv(out_file, index=False)
        elif suffix == ".pkl":
            logger.info("Save plate metadata to file: %s", out_file)
            plate_metadata.to_pickle(out_file)
        else:
            # previously this case was skipped without any indication
            logger.warning("Plate metadata not saved, unsupported file extension: %s", out_file)

    return plate_metadata




# [docs]
def get_image_metadata(metadata_file: Union[str, Path], out_file: Union[str, Path, None] = None) -> pd.DataFrame:
    """Extracts image metadata from the operetta xml file.

    In addition to the fields read from the XML, two columns are added:
    ``StandardFieldID`` (field position index, see below) and ``WellName``
    (derived from ``Row`` and ``Col``).

    Parameters
    ----------
    metadata_file
        path to the xml metadata file
    out_file
        enter a file path if this dataframe should be written to file
        (possible extensions are .csv or .pkl; any other extension is
        reported with a warning and nothing is written)

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # Image elements, n_cols = # xml fields + 2)

    Raises
    ------
    ValueError
        if the XML root has no ``Images`` element in the Harmony namespace
    """
    # define xml namespace
    ns = {"harmony": "http://www.perkinelmer.com/PEHH/HarmonyV5"}

    # get xml
    metadata_xml = ET.parse(metadata_file).getroot()
    images_node = metadata_xml.find("harmony:Images", namespaces=ns)
    if images_node is None:
        # Raise instead of calling os._exit(): a hard exit skips handler
        # flushing and cleanup, and cannot be handled by the caller.
        message = "Operetta image metadata XML not parsed correctly"
        logger.error(message)
        raise ValueError(message)
    images_xml = images_node.findall("harmony:Image", namespaces=ns)

    # convert to dataframe
    image_metadata = _xml_to_df(images_xml, "harmony", ns)

    # Scale positions by 1e9 and truncate to integers so that identical
    # positions compare equal exactly and can be used as dictionary keys.
    x_coords = (image_metadata["PositionX"].astype("float") * 1e9).astype("int")
    y_coords = (image_metadata["PositionY"].astype("float") * 1e9).astype("int")
    xy_coords = list(zip(x_coords, y_coords))

    # Number the unique positions starting at 1, ordered by decreasing y and
    # then increasing x (top-left to bottom-right, incrementing columns first)
    unique_xy_sorted = sorted(set(xy_coords), key=lambda xy: (-xy[1], xy[0]))
    field_index = {xy: i for i, xy in enumerate(unique_xy_sorted, start=1)}

    # keep this as StandardFieldID
    image_metadata["StandardFieldID"] = [field_index[xy] for xy in xy_coords]

    image_metadata = image_metadata.astype(image_metadata_dtypes)

    # add a "WellName" identifier
    image_metadata["WellName"] = [
        _to_well_name(row, col) for row, col in zip(image_metadata["Row"], image_metadata["Col"])
    ]

    # write file if requested
    if out_file is not None:
        suffix = Path(out_file).suffix
        if suffix == ".csv":
            logger.info("Save image metadata to file: %s", out_file)
            image_metadata.to_csv(out_file, index=False)
        elif suffix == ".pkl":
            logger.info("Save image metadata to file: %s", out_file)
            image_metadata.to_pickle(out_file)
        else:
            # previously this case was skipped without any indication
            logger.warning("Image metadata not saved, unsupported file extension: %s", out_file)

    return image_metadata



def load_image_metadata(metadata_file: Union[str, Path]) -> pd.DataFrame:
    """Loads image metadata previously saved during image conversion.

    Parameters
    ----------
    metadata_file
        path to the pkl or csv metadata file

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # fields-of-view, n_cols = # xml fields)

    Raises
    ------
    ValueError
        if the file extension is neither .pkl nor .csv
    """
    metadata_file = Path(metadata_file)
    if metadata_file.suffix == ".pkl":
        # NOTE: unpickling can execute arbitrary code - only load pickle
        # files that come from a trusted source.
        return pd.read_pickle(metadata_file)
    if metadata_file.suffix == ".csv":
        # column types are left to pandas' default inference
        return pd.read_csv(metadata_file)

    # Fail with a clear error; previously execution fell through to a
    # ``return`` of an unassigned variable and raised UnboundLocalError.
    message = f"Unknown metadata file: {str(metadata_file)}"
    logger.error(message)
    raise ValueError(message)