Source code for cellmil.data.storage

# -*- coding: utf-8 -*-
# Storage class to store a processed WSI and its batches
#
# References:
# CellViT: Vision Transformers for precise cell segmentation and classification
# Fabian Hörst et al., Medical Image Analysis, 2024
# DOI: https://doi.org/10.1016/j.media.2024.103143

import json
from json.decoder import JSONDecodeError
from pathlib import Path
from typing import List, Union, Dict, Any

import numpy as np
import yaml
from PIL import Image
from PIL.Image import Image as ImageType


[docs]class Storage:
    """Storage class to store all WSI related files

    Generates the following folder structure for storage:

        * Output-Path/WSI-Name
            * metadata.yaml: Metadata of the WSI
            * annotation_masks: preview images of annotations
            * patches: store extracted patches with each path "wsi_name_row_col.png"
            * metadata: store metadata for each path "wsi_name_row_col.yaml"
            * thumbnails: WSI thumbnails
            * tissue masks: Masks of tissue detection
            * Optional: context: context folder with subfolder for each context scale
            * Optional: masks: Masks for each patch as .npy files (numpy arrays)

    Args:
        wsi_name (str): Name of the WSI, as string. Just the name without suffix and no path!
        output_path (Union[Path, str]): Path to the folder where the resulting dataset should be stored.
        metadata (dict): Metadata of the WSI. Is stored in parent directory
        mask_images (dict[str, Image]): Masks generated during tissue detection stored in dict
            with keys equals the mask name and values equals the PIL image
        mask_images_annotations (dict[str, Image]): Annotation masks for provided annotations for the complete WSI.
            Masks are equal to the tissue masks sizes. Keys are the mask names and values are the PIL images.
        thumbnails (dict[str, Image]): Dictionary with thumbnails and corresponding thumbnail names.
            Names are keys, PIL Images are values
        store_masks (bool, optional): Set to store masks per patch. Defaults to False.
        save_context (bool, optional): If context patches are provided. Defaults to False.
        context_scales (List[int], optional): List with context scales. Defaults to None.
    """

[docs]    def __init__(
        self,
        wsi_name: str,
        output_path: Union[Path, str],
        metadata: dict[str, Any],
        mask_images: dict[str, ImageType],
        mask_images_annotations: dict[str, ImageType],
        thumbnails: dict[str, ImageType],
        store_masks: bool = False,
        save_context: bool = False,
        context_scales: List[int] | None = None,
    ) -> None:
        self.wsi_name = wsi_name
        self.output_path = Path(output_path)
        self.save_context = save_context

        self.wsi_path = self.output_path / self.wsi_name
        self.wsi_path.mkdir(parents=True, exist_ok=True)
        self.patches_path = self.wsi_path / "patches"
        self.patches_path.mkdir(parents=True, exist_ok=True)
        self.patch_metadata_path = self.wsi_path / "metadata"
        self.patch_metadata_path.mkdir(parents=True, exist_ok=True)
        self.thumbnail_path = self.wsi_path / "thumbnails"
        self.thumbnail_path.mkdir(parents=True, exist_ok=True)
        self.tissue_mask_path = self.wsi_path / "tissue_masks"
        self.tissue_mask_path.mkdir(parents=True, exist_ok=True)
        self.annotation_mask_path = self.wsi_path / "annotation_masks"
        self.annotation_mask_path.mkdir(parents=True, exist_ok=True)

        if self.save_context:
            assert context_scales is not None, (
                "Please provide at least one context scale"
            )
            self.context_path = self.wsi_path / "context"
            self.context_path.mkdir(parents=True, exist_ok=True)
            for scale in context_scales:
                (self.context_path / str(scale)).mkdir(parents=True, exist_ok=True)

        self.store_masks = store_masks
        if self.store_masks:
            self.masks_path = self.wsi_path / "masks"
            self.masks_path.mkdir(parents=True, exist_ok=True)

        self.metadata = metadata

        self.save_meta_data()
        self.save_masks(mask_images)
        self.save_annotation_mask(mask_images_annotations)
        self.save_thumbnails(thumbnails)

[docs]    def save_meta_data(self) -> None:
        """
        Store arbitrary meta data in a yaml file on wsi output folder
        """
        # ensure folder exists
        with open(self.wsi_path / "metadata.yaml", "w") as outfile:
            yaml.dump(
                self.metadata,
                outfile,
                sort_keys=False,
                default_flow_style=False,
                allow_unicode=True,
            )

[docs]    def save_masks(self, mask_images: Dict[str, ImageType]):
        """Save tissue masks

        Args:
            mask_images (dict[str, Image]): Masks generated during tissue detection stored in dict
                with keys equals the mask name and values equals the PIL image
        """
        assert "mask" in mask_images.keys()

        for mask_name, mask in mask_images.items():
            mask_path = self.tissue_mask_path / f"{mask_name}.png"
            mask.save(str(mask_path))
        mask_images["mask"].save(self.wsi_path / "mask.png")

[docs]    def save_annotation_mask(self, mask_images_annotations: Dict[str, ImageType]):
        """Save annotation masks

        Args:
            mask_images_annotations (dict[str, Image]): Annotation masks for provided annotations for the complete WSI.
                Masks are equal to the tissue masks sizes. Keys are the mask names and values are the PIL images.
        """
        for mask_name, mask in mask_images_annotations.items():
            mask_path = self.annotation_mask_path / f"{mask_name}.png"
            mask_path_eps = self.annotation_mask_path / f"{mask_name}.eps"
            mask.save(str(mask_path))
            mask.save(str(mask_path_eps))

[docs]    def save_thumbnails(self, thumbnails: Dict[str, ImageType]):
        """Save thumbnails of WSI

        Args:
            thumbnails (dict[str, Image]): Dictionary with thumbnails and corresponding thumbnail names.
                Names are keys, PIL Images are values
        """
        assert "thumbnail" in thumbnails.keys()

        for sample_factor, thumbnail in thumbnails.items():
            thumbnail_path = self.thumbnail_path / f"thumbnail_{sample_factor}.png"
            thumbnail.save(str(thumbnail_path))
        thumbnails["thumbnail"].save(self.wsi_path / "thumbnail.png")

[docs]    def save_elem_to_disk(
        self, patch_result: tuple[np.ndarray[Any, Any], dict[str, Any], None, dict[str, Any]]
    ) -> None:
        patch, patch_metadata, patch_mask, context = patch_result

        row: int = patch_metadata["row"]
        col: int = patch_metadata["col"]
        patch_fname = f"{self.wsi_name}_{row}_{col}.png"
        patch_yaml_name = f"{self.wsi_name}_{row}_{col}.yaml"

        # Save the patch
        Image.fromarray(patch).save(self.patches_path / patch_fname)

        # Save the metadata
        with open(self.patch_metadata_path / patch_yaml_name, "w") as yaml_file:
            yaml.dump(
                patch_metadata, yaml_file, default_flow_style=False, sort_keys=False
            )

        # Save the Mask
        if patch_mask is not None and self.store_masks:
            np.save(
                str(self.masks_path / f"{Path(patch_fname).stem}_mask.npy"),
                patch_mask.squeeze(),
            )

        # Save context patches if non empty
        if self.save_context:
            patch_metadata["context_scales"] = {}
            for scale, context_images in context.items():
                context_name = f"{Path(patch_fname).stem}_context_{scale}.png"
                Image.fromarray(context_images).save(
                    self.context_path / str(scale) / context_name
                )
                patch_metadata["context_scales"][scale] = f"./context/{context_name}"

[docs]    def clean_up(
        self,
        patch_distribution: dict[int, int],
        patch_metadata_list: list[dict[str, dict[str, Any]]],
    ):
        """Clean-Up function, called after WSI has been processed. Appends WSI to `processed.json` file
        and generated a metadata file in root folder called `patch_metadata.json` with merged metadata for all patches
        in one file.

        Args:
            patch_distribution (dict): Patch distrubtion dict. Keys: Lables, values: number of patches in class
            patch_metadata_list (list[dict]): List with all patch metadata to store
        """
        try:
            with open(str(self.output_path / "processed.json"), "r") as processed_list:
                try:
                    processed_files = json.load(processed_list)
                    processed_files["processed_files"].append(self.wsi_name)
                except JSONDecodeError:
                    processed_files = {"processed_files": [self.wsi_name]}
        except FileNotFoundError:
            processed_files = {"processed_files": [self.wsi_name]}
        with open(str(self.output_path / "processed.json"), "w") as processed_list:
            json.dump(processed_files, processed_list, indent=2)

        # count patches per class
        self.metadata["patch_distribution"] = patch_distribution
        self.save_meta_data()

        # save patch metadata file
        with open(self.wsi_path / "patch_metadata.json", "w") as outfile:
            json.dump(patch_metadata_list, outfile, indent=2)