Source code for glasscut.storage.manager

"""Storage management for dataset generation outputs."""

from datetime import datetime
from pathlib import Path
from typing import cast

import orjson

from .structures import (
    DatasetMetadata,
    JsonValue,
    SlideMetadata,
    TileMetadata,
    dataclass_to_dict,
)


class StorageOrganizer:
    """Manage output directory layout and JSON metadata persistence.

    Every path is rooted at ``output_dir`` and follows this layout::

        output_dir/
            <dataset_id>/
                metadata.json
                processed.json
                <slide_id>/
                    slide_metadata.json
                    tiles/
                    thumbnails/
                    masks/
    """

    def __init__(self, output_dir: str | Path) -> None:
        """Resolve ``output_dir`` to an absolute path and create it if missing."""
        self.output_dir = Path(output_dir).resolve()
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def init_dataset(self, dataset_id: str) -> Path:
        """Create the dataset root directory and return its path."""
        dataset_path = self.output_dir / dataset_id
        dataset_path.mkdir(parents=True, exist_ok=True)
        return dataset_path

    def init_slide(self, dataset_id: str, slide_id: str) -> dict[str, Path]:
        """Create the standard directories for one processed slide.

        Returns:
            Mapping with the keys ``"root"``, ``"tiles"``, ``"thumbnails"``
            and ``"masks"`` pointing at the created directories.
        """
        slide_root = self.output_dir / dataset_id / slide_id
        directories = {
            "root": slide_root,
            "tiles": slide_root / "tiles",
            "thumbnails": slide_root / "thumbnails",
            "masks": slide_root / "masks",
        }
        for directory in directories.values():
            directory.mkdir(parents=True, exist_ok=True)
        return directories

    def save_dataset_metadata(
        self,
        dataset_id: str,
        metadata: DatasetMetadata,
    ) -> Path:
        """Persist top-level metadata as ``<dataset_id>/metadata.json``.

        Returns:
            Path of the written file.
        """
        metadata_path = self.output_dir / dataset_id / "metadata.json"
        self._write_json(metadata_path, dataclass_to_dict(metadata))
        return metadata_path

    def save_slide_metadata(
        self,
        dataset_id: str,
        slide_id: str,
        metadata: SlideMetadata,
    ) -> Path:
        """Persist per-slide metadata as ``<slide_id>/slide_metadata.json``.

        Returns:
            Path of the written file.
        """
        metadata_path = (
            self.output_dir / dataset_id / slide_id / "slide_metadata.json"
        )
        self._write_json(metadata_path, dataclass_to_dict(metadata))
        return metadata_path

    def save_processed_json(
        self, dataset_id: str, processed_slides: list[str]
    ) -> Path:
        """Persist processed slide IDs in PathoPatcher-style processed.json.

        The payload holds the slide IDs (``processed_files``), their count
        (``total``) and the time of writing (``timestamp``).

        Returns:
            Path of the written file.
        """
        processed_path = self.output_dir / dataset_id / "processed.json"
        payload: dict[str, JsonValue] = {
            "processed_files": cast(
                list[JsonValue], [str(name) for name in processed_slides]
            ),
            # Naive local time in ISO-8601 format (no timezone suffix).
            "timestamp": datetime.now().isoformat(),
            "total": len(processed_slides),
        }
        self._write_json(processed_path, payload)
        return processed_path

    def load_processed_json(self, dataset_id: str) -> list[str]:
        """Load processed slide IDs from processed.json.

        Returns an empty list when the file does not exist or when its
        content does not have the expected shape. Non-string entries in
        ``processed_files`` are skipped.
        """
        processed_path = self.output_dir / dataset_id / "processed.json"
        try:
            # Read directly instead of checking ``exists()`` first, so the
            # file cannot disappear between the check and the read.
            payload = self._read_json(processed_path)
        except (FileNotFoundError, NotADirectoryError):
            return []

        if not isinstance(payload, dict):
            return []
        processed = payload.get("processed_files")
        if not isinstance(processed, list):
            return []
        return [value for value in processed if isinstance(value, str)]

    def load_slide_metadata(self, dataset_id: str, slide_id: str) -> SlideMetadata:
        """Load slide metadata from disk and reconstruct dataclasses.

        Missing or wrongly typed fields fall back to neutral defaults
        (``0``, ``0.0``, ``""``, ``(0, 0)`` or an empty list) rather than
        raising. ``slide_id`` defaults to the argument and ``total_tiles``
        to the number of tiles that were successfully parsed.

        Raises:
            FileNotFoundError: If ``slide_metadata.json`` does not exist.
            ValueError: If the top-level JSON value is not an object.
        """
        metadata_path = (
            self.output_dir / dataset_id / slide_id / "slide_metadata.json"
        )
        payload = self._read_json(metadata_path)
        if not isinstance(payload, dict):
            raise ValueError(f"Invalid slide metadata format: {metadata_path}")

        raw_tiles = payload.get("tiles", [])
        if not isinstance(raw_tiles, list):
            raw_tiles = []
        # Entries that are not JSON objects are silently skipped.
        tiles: list[TileMetadata] = [
            self._tile_from_dict(raw_tile)
            for raw_tile in raw_tiles
            if isinstance(raw_tile, dict)
        ]

        raw_available_mags = payload.get("available_magnifications", [])
        available_magnifications: list[float] = []
        if isinstance(raw_available_mags, list):
            available_magnifications = [
                float(value)
                for value in raw_available_mags
                if isinstance(value, (int, float))
            ]

        return SlideMetadata(
            slide_id=self._as_str(payload.get("slide_id", slide_id)),
            slide_name=self._as_str(payload.get("slide_name", "")),
            slide_path=self._as_str(payload.get("slide_path", "")),
            total_tiles=self._as_int(payload.get("total_tiles", len(tiles))),
            dimensions=self._as_int_pair(payload.get("dimensions", [0, 0])),
            mpp=self._as_float(payload.get("mpp", 0.0)),
            available_magnifications=available_magnifications,
            tile_size=self._as_int_pair(payload.get("tile_size", [0, 0])),
            tiler_name=self._as_str(payload.get("tiler_name", "")),
            timestamp=self._as_str(payload.get("timestamp", "")),
            tiles=tiles,
        )

    def _tile_from_dict(self, raw_tile: dict[str, JsonValue]) -> TileMetadata:
        """Build one ``TileMetadata`` from its JSON object, defaulting bad fields."""
        return TileMetadata(
            tile_id=self._as_str(raw_tile.get("tile_id", "")),
            x=self._as_int(raw_tile.get("x", 0)),
            y=self._as_int(raw_tile.get("y", 0)),
            width=self._as_int(raw_tile.get("width", 0)),
            height=self._as_int(raw_tile.get("height", 0)),
            level=self._as_int(raw_tile.get("level", 0)),
            magnification=self._as_float(raw_tile.get("magnification", 0.0)),
            tissue_ratio=self._as_float(raw_tile.get("tissue_ratio", 0.0)),
            file_path=self._as_str(raw_tile.get("file_path", "")),
        )

    @staticmethod
    def _write_json(path: Path, payload: JsonValue) -> None:
        """Serialize ``payload`` to ``path`` as indented JSON."""
        # Keep metadata human-readable while using faster serialization.
        serialized = orjson.dumps(payload, option=orjson.OPT_INDENT_2)
        # The save_* methods may be called without a prior init_dataset /
        # init_slide, so make sure the target directory exists.
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_bytes(serialized)

    @staticmethod
    def _read_json(path: Path) -> JsonValue:
        """Read and parse the JSON document stored at ``path``."""
        raw = path.read_bytes()
        return cast(JsonValue, orjson.loads(raw))

    @staticmethod
    def _as_int(value: JsonValue) -> int:
        """Return ``value`` truncated to ``int``, or ``0`` if it is not numeric."""
        # ``bool`` is a subclass of ``int``, so this check also covers it.
        if isinstance(value, (int, float)):
            try:
                return int(value)
            except (OverflowError, ValueError):
                # Non-finite floats (inf / nan) have no integer value.
                return 0
        return 0

    @staticmethod
    def _as_float(value: JsonValue) -> float:
        """Return ``value`` as ``float``, or ``0.0`` if it is not numeric."""
        # ``bool`` is a subclass of ``int``, so this check also covers it.
        if isinstance(value, (int, float)):
            return float(value)
        return 0.0

    @staticmethod
    def _as_str(value: JsonValue) -> str:
        """Return ``value`` if it is a string, otherwise ``""``."""
        return value if isinstance(value, str) else ""

    @staticmethod
    def _as_int_pair(value: JsonValue) -> tuple[int, int]:
        """Return a two-element numeric list as an ``(int, int)`` tuple.

        Anything else (wrong type, wrong length, non-numeric or non-finite
        items) yields ``(0, 0)``.
        """
        if isinstance(value, list) and len(value) == 2:
            first, second = value
            if isinstance(first, (int, float)) and isinstance(second, (int, float)):
                try:
                    return (int(first), int(second))
                except (OverflowError, ValueError):
                    return (0, 0)
        return (0, 0)