# Source code for blimp.preprocessing.operetta_parse_metadata
"""Copyright 2023 (C) University of New South Wales Original author:
Scott Berry <scott.berry@unsw.edu.au>
"""
from typing import Union
from pathlib import Path
import os
import logging
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Target dtype for each column of the image metadata table.
# get_image_metadata() passes this mapping to DataFrame.astype(), so every key
# listed here must be present as a column at that point.
image_metadata_dtypes = {
    "id": str,
    "State": str,
    "URL": str,
    "Row": np.uint8,
    "Col": np.uint8,
    "FieldID": np.uint16,
    "PlaneID": np.uint16,
    "TimepointID": np.int64,
    "ChannelID": np.uint8,
    "FlimID": np.uint8,
    "ChannelName": str,
    "ImageType": str,
    "AcquisitionType": str,
    "IlluminationType": str,
    "ChannelType": str,
    "ImageResolutionX": float,
    "ImageResolutionY": float,
    "ImageSizeX": np.uint16,
    "ImageSizeY": np.uint16,
    "BinningX": np.uint8,
    "BinningY": np.uint8,
    "MaxIntensity": int,
    "CameraType": str,
    "PositionX": float,
    "PositionY": float,
    "PositionZ": float,
    "AbsPositionZ": float,
    "MeasurementTimeOffset": float,
    "AbsTime": "datetime64[ns]",
    "MainExcitationWavelength": np.uint16,
    "MainEmissionWavelength": np.uint16,
    "ObjectiveMagnification": np.uint8,
    "ObjectiveNA": float,
    "ExposureTime": float,
    "OrientationMatrix": str,
    # Not read from the XML: assigned in get_image_metadata() from the
    # PositionX / PositionY values.
    "StandardFieldID": np.uint16,
}
def _remove_ns(s: str) -> str:
"""Strip text before "}".
Parameters
----------
s : string
string to be modified
Returns
str
string with text up to and including "}" removed
Examples
--------
>>> _remove_ns("{http://www.perkinelmer.com/PEHH/HarmonyV5}Plates")
'Plates'
"""
return s.split("}")[1][0:]
def _xml_to_df(xmls, ns_key: str, ns_dict: dict) -> pd.DataFrame:
    """Convert a list of xml elements into a pandas dataframe, where each
    element forms a row and each of its direct children forms a column.

    Parameters
    ----------
    xmls
        iterable of XML elements to parse (one per output row)
    ns_key
        namespace key of the child elements; kept for interface
        compatibility, the child text is now read directly while iterating
    ns_dict
        namespace dictionary; kept for interface compatibility

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = length(xmls), n_cols = # fields). Column names
        are the child tags with the namespace removed; values are the
        children's text.

    Examples
    --------
    >>> idx_path = image_dir / 'Images' / 'Index.idx.xml'
    >>> idx_xml = ET.parse(idx_path).getroot()
    >>> ns = {'harmony': "http://www.perkinelmer.com/PEHH/HarmonyV5"}
    >>> plates_xml = idx_xml.find('harmony:Plates',namespaces=ns).findall('harmony:Plate',namespaces=ns)
    >>> _xml_to_df(plates_xml,"harmony",ns)
    """
    records = []
    for element in xmls:
        record = {}
        for field in element:
            # Read the text from the child we are already visiting rather than
            # searching for it again by tag. setdefault keeps the first
            # occurrence of a repeated tag, which is what Element.find()
            # returned previously.
            record.setdefault(_remove_ns(field.tag), field.text)
        records.append(record)
    # convert to dataframe
    return pd.DataFrame(records)
def _to_well_name(row: int, column: int) -> str:
"""Convert row and column numbers to well name.
Parameters
----------
row : int
Row of the plate
column : int
Column of the plate
Returns
-------
str
Well name
Examples
--------
>>> _to_well_name(1,13)
A13
"""
return chr(96 + row).upper() + "%0.2d" % column
# [docs]
def get_plate_metadata(metadata_file: Union[str, Path], out_file: Union[str, Path, None] = None) -> pd.DataFrame:
    """Extracts plate metadata from the operetta xml file.

    Parameters
    ----------
    metadata_file
        path to the xml metadata file
    out_file
        enter a file path if this dataframe should be written to file
        (possible extensions are .csv or .pkl)

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # Plate elements, n_cols = # xml fields)

    Raises
    ------
    ValueError
        If the xml file has no ``Plates`` element in the Harmony namespace
    """
    # define xml namespace
    ns = {"harmony": "http://www.perkinelmer.com/PEHH/HarmonyV5"}

    # get xml
    metadata_xml = ET.parse(metadata_file).getroot()
    plates_node = metadata_xml.find("harmony:Plates", namespaces=ns)
    if plates_node is None:
        # Raise instead of os._exit(): callers can handle the error and the
        # interpreter is not terminated without cleanup.
        message = "Operetta plate metadata XML not parsed correctly"
        logger.error(message)
        raise ValueError(message)
    plates_xml = plates_node.findall("harmony:Plate", namespaces=ns)

    # convert to dataframe
    plate_metadata = _xml_to_df(plates_xml, "harmony", ns)

    # write file if requested
    if out_file is not None:
        logger.info("Save plate metadata to file: %s", out_file)
        suffix = Path(out_file).suffix
        if suffix == ".csv":
            plate_metadata.to_csv(out_file, index=False)
        elif suffix == ".pkl":
            plate_metadata.to_pickle(out_file)
        else:
            # previously this case was skipped without any notice
            logger.warning("Unsupported file extension '%s': plate metadata not written", suffix)

    return plate_metadata
# [docs]
def get_image_metadata(metadata_file: Union[str, Path], out_file: Union[str, Path, None] = None) -> pd.DataFrame:
    """Extracts image metadata from the operetta xml file.

    In addition to the xml fields, two columns are added: ``StandardFieldID``
    (fields numbered from the PositionX / PositionY values) and ``WellName``
    (derived from ``Row`` and ``Col``).

    Parameters
    ----------
    metadata_file
        path to the xml metadata file
    out_file
        enter a file path if this dataframe should be written to file
        (possible extensions are .csv or .pkl)

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # Image elements, n_cols = # xml fields + 2)

    Raises
    ------
    ValueError
        If the xml file has no ``Images`` element in the Harmony namespace
    """
    # define xml namespace
    ns = {"harmony": "http://www.perkinelmer.com/PEHH/HarmonyV5"}

    # get xml
    metadata_xml = ET.parse(metadata_file).getroot()
    images_node = metadata_xml.find("harmony:Images", namespaces=ns)
    if images_node is None:
        # Raise instead of os._exit(): callers can handle the error and the
        # interpreter is not terminated without cleanup.
        message = "Operetta image metadata XML not parsed correctly"
        logger.error(message)
        raise ValueError(message)
    images_xml = images_node.findall("harmony:Image", namespaces=ns)

    # convert to dataframe
    image_metadata = _xml_to_df(images_xml, "harmony", ns)

    # Scale positions by 1e9 and truncate to integers so that coordinates can
    # be compared exactly and used as dictionary keys.
    x_coords = (image_metadata["PositionX"].astype("float") * 1e9).astype("int")
    y_coords = (image_metadata["PositionY"].astype("float") * 1e9).astype("int")
    xy_coords = list(zip(x_coords, y_coords))

    # Number fields from top-left to bottom-right (increase x first):
    # descending y, then ascending x.
    ordered_coords = sorted(set(xy_coords), key=lambda xy: (-xy[1], xy[0]))
    coord_index = {xy: index for index, xy in enumerate(ordered_coords, start=1)}

    # keep this as StandardFieldID
    image_metadata["StandardFieldID"] = [coord_index[xy] for xy in xy_coords]
    image_metadata = image_metadata.astype(image_metadata_dtypes)

    # add a "WellName" identifier
    image_metadata["WellName"] = [
        _to_well_name(row, col) for row, col in zip(image_metadata["Row"], image_metadata["Col"])
    ]

    # write file if requested
    if out_file is not None:
        logger.info("Save image metadata to file: %s", out_file)
        suffix = Path(out_file).suffix
        if suffix == ".csv":
            image_metadata.to_csv(out_file, index=False)
        elif suffix == ".pkl":
            image_metadata.to_pickle(out_file)
        else:
            # previously this case was skipped without any notice
            logger.warning("Unsupported file extension '%s': image metadata not written", suffix)

    return image_metadata
def load_image_metadata(metadata_file: Union[str, Path]) -> pd.DataFrame:
    """Loads image metadata previously saved during image conversion.

    Parameters
    ----------
    metadata_file
        path to the pkl or csv metadata file

    Returns
    -------
    pandas.DataFrame
        Dimensions (n_rows = # fields-of-view, n_cols = # xml fields)

    Raises
    ------
    ValueError
        If the file extension is neither ``.pkl`` nor ``.csv``
    """
    metadata_file = Path(metadata_file)
    suffix = metadata_file.suffix

    if suffix == ".pkl":
        # NOTE(review): unpickling can execute arbitrary code; only load
        # metadata files from a trusted source.
        return pd.read_pickle(metadata_file)
    if suffix == ".csv":
        return pd.read_csv(metadata_file)

    # Previously execution fell through to `return metadata` with the variable
    # unbound; fail with an explicit, descriptive error instead.
    message = f"Unknown metadata file: {str(metadata_file)}"
    logger.error(message)
    raise ValueError(message)