# Source code for miranda.archive._groupings

from __future__ import annotations

import logging
import re
from logging.config import dictConfig
from pathlib import Path
from types import GeneratorType
from typing import Dict, List

from miranda.scripting import LOGGING_CONFIG
from import report_file_size

Nested_List = List[List[Path]]
PathDict = Dict[str, List[Path]]

GiB = int(pow(2, 30))

__all__ = [

def group_by_length(
    files: GeneratorType | list[str | Path],
    size: int = 10,
    sort: bool = False,
) -> list[list[Path]]:
    """Group files by an arbitrary number of file entries.

    Parameters
    ----------
    files : GeneratorType or list of str or pathlib.Path
        The files to be grouped. The iterable is consumed in a single pass.
    size : int
        Maximum number of files per group. Must be at least 1.
    sort : bool
        If True, the paths are sorted before being grouped; otherwise the
        incoming order is kept.

    Returns
    -------
    list[list[pathlib.Path]]
        Consecutive groups of `size` paths; only the last group may be shorter.
        An empty input yields an empty list.

    Raises
    ------
    ValueError
        If `size` is smaller than 1.
    """
    # A chunk length of zero would otherwise surface as an opaque ZeroDivisionError.
    if size < 1:
        raise ValueError(f"`size` must be a positive integer, got {size}.")

    logging.info("Creating groups of %s files", size)

    paths = [Path(f) for f in files]
    if sort:
        paths.sort()

    # Slicing in steps of `size` leaves the remainder (if any) as the final group
    # and never produces an empty trailing group.
    grouped_list = [paths[start : start + size] for start in range(0, len(paths), size)]

    logging.info("Divided files into %s groups.", len(grouped_list))
    return grouped_list
def group_by_deciphered_date(
    files: GeneratorType | list[str | Path],
) -> dict[str, list[Path]]:
    """Find a common date and groups files based on year and month.

    The date is deciphered from each file name (not its parent folders). File
    names carrying a year, month and day are grouped under a ``"YYYY-MM"`` key;
    file names carrying only a year and month are grouped under a ``"YYYY"`` key.

    Parameters
    ----------
    files : GeneratorType or list of str or pathlib.Path
        The files to be grouped. Only names ending in ``.nc`` or ``.zarr`` can match.

    Returns
    -------
    dict[str, list[pathlib.Path]]
        The date-keyed groups when every file could be assigned a date.
        Otherwise grouping is aborted and all (sorted) files are returned under
        the single key ``"data"``.
    """
    logging.warning("This function doesn't work well with multi-thread processing!")
    logging.info("Creating files from deciphered dates.")

    year_month_day = re.compile(
        r"(?P<year>\d{4})-?(?P<month>\d{2})-?(?P<day>\d{2})?.*\.(?P<suffix>nc|zarr)$"
    )

    paths = sorted(Path(f) for f in files)
    dates: dict[str, list[Path]] = {}
    total = 0
    for path in paths:
        match = year_month_day.search(path.name)
        if match is None:
            # No date could be deciphered: leave the file uncounted so that the
            # "grouping aborted" fallback below is taken instead of crashing.
            continue
        if match.group("day"):
            key = "-".join([match.group("year"), match.group("month")])
        else:
            # `month` is mandatory in the pattern, so a match without a day
            # always carries a year and a month.
            key = match.group("year")
        dates.setdefault(key, []).append(path)
        total += 1

    if dates and total == len(paths):
        logging.info(
            "All files have been grouped by date. %s groups created.", len(dates)
        )
        return dates

    if dates:
        logging.info(
            "Not all files were successfully grouped by date. Grouping aborted."
        )
    else:
        logging.info("No matches for dates found. Grouping aborted.")
    return {"data": paths}
def group_by_size(
    files: GeneratorType | list[str | Path], size: int = 10 * GiB
) -> list[list[Path]]:
    """Group files up until a desired size and save it as a grouping within a list.

    Files are sorted, then accumulated into a group until adding the next file
    would push the group's cumulative size past `size`; that file then starts
    a new group. A single file larger than `size` is placed in a group of its own.

    Parameters
    ----------
    files : GeneratorType or list of str or pathlib.Path
        The files to be grouped. Each file is stat'ed to read its size.
    size : int
        Upper bound, in bytes, on the cumulative size of a group.

    Returns
    -------
    list[list[pathlib.Path]]
        The groups of paths, in sorted order.

    Raises
    ------
    OSError
        If the size of a file cannot be read (e.g. the file does not exist).
    """
    logging.info(
        "Creating groups of files based on size not exceeding: %s.",
        report_file_size(size),
    )

    paths = sorted(Path(f) for f in files)
    grouped_list: list[list[Path]] = []
    group: list[Path] = []
    total = 0
    for path in paths:
        file_size = path.stat().st_size
        # Close the current group *before* it would overflow, so that a group
        # only exceeds `size` when a single file is itself larger than `size`.
        if group and total + file_size > size:
            grouped_list.append(group)
            group = []
            total = 0
        group.append(path)
        total += file_size

    if group:
        grouped_list.append(group)
    else:
        logging.info("The final group is empty. Skipping this set...")
    return grouped_list
def group_by_subdirectories(
    files: GeneratorType | list[str | Path], within: str | Path | None = None
) -> dict[Path, list[Path]]:
    """Group files based on the parent folder that they are located within.

    Parameters
    ----------
    files : GeneratorType or list of str or pathlib.Path
        The files to be grouped.
    within : str or pathlib.Path, optional
        The folder that group names are expressed relative to.
        Defaults to the current working directory.

    Returns
    -------
    dict[pathlib.Path, list[pathlib.Path]]
        The sorted files, keyed by their parent folder relative to `within`.
        Keys are `pathlib.Path` objects; files located directly in `within`
        are keyed by ``Path(".")``.

    Raises
    ------
    ValueError
        If a file is not located under `within` (raised by `Path.relative_to`).
    """
    base = within if within else Path.cwd()

    groups: dict[Path, list[Path]] = {}
    for path in sorted(Path(f) for f in files):
        group_name = path.relative_to(base).parent
        groups.setdefault(group_name, []).append(path)

    logging.info(
        "File subdirectories found. Proceeding with: `%s`.",
        ", ".join(str(key) for key in groups),
    )
    return groups