from __future__ import annotations import re import sqlite3 from contextlib import contextmanager, suppress from pathlib import Path, PurePath from typing import Any from musicdl.catalogsync.db import connect_database _COPY_SUFFIX_RE = re.compile(r" \(\d+\)(?=(\.[^.]+)?$)") class LocalDedupeBlockedError(RuntimeError): pass def _coerce_int(value: Any) -> int | None: try: return int(value) except (TypeError, ValueError): return None def _row_value(row: sqlite3.Row | dict[str, Any], key: str) -> Any: if isinstance(row, sqlite3.Row): try: return row[key] except IndexError: return None return row.get(key) def _path_for_location(row: sqlite3.Row | dict[str, Any]) -> Path | None: absolute_path = str(_row_value(row, "absolute_path") or "").strip() if absolute_path: return Path(absolute_path) base_path = str(_row_value(row, "base_path") or "").strip() locator = str(_row_value(row, "locator") or "").strip() if not base_path or not locator: return None return Path(base_path) / locator def _resolved_path(path: Path | None) -> Path | None: if path is None: return None with suppress(OSError, RuntimeError): return path.resolve(strict=False) return path def _paths_match(left: Path | None, right: Path | None) -> bool: if left is None or right is None: return False return _resolved_path(left) == _resolved_path(right) def _has_copy_suffix(locator: str | None) -> bool: return bool(_COPY_SUFFIX_RE.search(PurePath(str(locator or "")).name)) def _location_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]: path = _path_for_location(row) file_exists = bool(path and path.exists()) actual_file_size_bytes = None if file_exists and path is not None: with suppress(OSError): actual_file_size_bytes = int(path.stat().st_size) return { "id": int(row["location_id"]), "file_asset_id": int(row["file_asset_id"]), "song_id": int(row["song_id"]), "backend_id": int(row["backend_id"]), "backend_name": str(row["backend_name"] or ""), "locator": str(row["locator"] or ""), "absolute_path": str(row["absolute_path"] or ""), "file_exists": file_exists, "file_size_bytes": _coerce_int(row["file_size_bytes"]), "actual_file_size_bytes": actual_file_size_bytes, "song_name": str(row["song_name"] or ""), "singers": str(row["singers"] or ""), "_path": path, } def _location_sort_key(location: dict[str, Any]) -> tuple[int, int, int, int]: return ( 0 if location["file_exists"] else 1, 0 if not _has_copy_suffix(location["locator"]) else 1, len(location["locator"]), int(location["id"]), ) def _duplicate_size_bytes(location: dict[str, Any]) -> int: size_value = location.get("actual_file_size_bytes") if size_value is None: size_value = location.get("file_size_bytes") return max(int(size_value or 0), 0) class LocalMaintenanceService: def __init__(self, db_path: str | Path): self.db_path = Path(db_path) def _connect(self) -> sqlite3.Connection: return connect_database(self.db_path) @contextmanager def _connection(self): conn = self._connect() try: yield conn conn.commit() finally: conn.close() def scan_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]: with self._connection() as conn: groups = self._load_duplicate_groups(conn) scanned_row = conn.execute( """ SELECT COUNT(*) AS count_value FROM file_locations AS fl JOIN storage_backends AS sb ON sb.id = fl.backend_id WHERE fl.status = 'active' AND sb.backend_type = 'local_fs' """ ).fetchone() return self._build_scan_payload( groups, scanned_active_local_location_count=int(scanned_row["count_value"]) if scanned_row else 0, sample_limit=sample_limit, ) def dedupe_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]: with self._connection() as conn: self._raise_if_running_work(conn) groups = self._load_duplicate_groups(conn) execution = { "deduped_group_count": 0, "inactive_location_count": 0, "deleted_file_count": 0, "released_bytes": 0, "repointed_upload_task_count": 0, "repointed_job_item_count": 0, } affected_pairs: set[tuple[int, int]] = set() for group in groups: keep = group["keep"] duplicates = list(group["duplicates"]) if not duplicates: continue execution["deduped_group_count"] += 1 conn.execute( """ UPDATE file_locations SET is_primary = CASE WHEN id = ? THEN 1 ELSE 0 END, updated_at = CURRENT_TIMESTAMP WHERE file_asset_id = ? AND backend_id = ? """, ( int(keep["id"]), int(group["file_asset_id"]), int(group["backend_id"]), ), ) for duplicate in duplicates: duplicate_id = int(duplicate["id"]) upload_cursor = conn.execute( """ UPDATE upload_tasks SET source_location_id = ?, updated_at = CURRENT_TIMESTAMP WHERE source_location_id = ? """, (int(keep["id"]), duplicate_id), ) execution["repointed_upload_task_count"] += max(upload_cursor.rowcount, 0) item_cursor = conn.execute( """ UPDATE job_items SET file_location_id = ? WHERE file_location_id = ? """, (int(keep["id"]), duplicate_id), ) execution["repointed_job_item_count"] += max(item_cursor.rowcount, 0) inactive_cursor = conn.execute( """ UPDATE file_locations SET status = 'inactive', is_primary = 0, updated_at = CURRENT_TIMESTAMP WHERE id = ? AND status = 'active' """, (duplicate_id,), ) execution["inactive_location_count"] += max(inactive_cursor.rowcount, 0) duplicate_path = duplicate["_path"] if ( duplicate_path is not None and duplicate_path.exists() and not _paths_match(duplicate_path, keep["_path"]) ): duplicate_size_bytes = _duplicate_size_bytes(duplicate) with suppress(OSError): duplicate_path.unlink() execution["deleted_file_count"] += 1 execution["released_bytes"] += duplicate_size_bytes affected_pairs.add((int(group["song_id"]), int(group["backend_id"]))) for song_id, backend_id in affected_pairs: self._refresh_song_backend_presence_with_connection( conn, song_id=song_id, backend_id=backend_id, ) payload = self.scan_local_duplicates(sample_limit=sample_limit) payload["execution"] = execution return payload def _raise_if_running_work(self, conn: sqlite3.Connection) -> None: running_jobs_row = conn.execute( "SELECT COUNT(*) AS count_value FROM job_runs WHERE status = 'running'" ).fetchone() running_items_row = conn.execute( "SELECT COUNT(*) AS count_value FROM job_items WHERE status = 'running'" ).fetchone() running_jobs = int(running_jobs_row["count_value"]) if running_jobs_row else 0 running_items = int(running_items_row["count_value"]) if running_items_row else 0 if running_jobs > 0 or running_items > 0: raise LocalDedupeBlockedError( f"cannot dedupe while jobs or items are running (jobs={running_jobs}, items={running_items})" ) def _load_duplicate_groups(self, conn: sqlite3.Connection) -> list[dict[str, Any]]: rows = conn.execute( """ WITH duplicate_pairs AS ( SELECT fl.file_asset_id, fl.backend_id FROM file_locations AS fl JOIN storage_backends AS sb ON sb.id = fl.backend_id WHERE fl.status = 'active' AND sb.backend_type = 'local_fs' GROUP BY fl.file_asset_id, fl.backend_id HAVING COUNT(*) > 1 ) SELECT fl.id AS location_id, fl.file_asset_id, fa.song_id, fl.backend_id, sb.name AS backend_name, sb.base_path, fl.locator, fl.absolute_path, COALESCE(fa.file_size_bytes, s.file_size_bytes) AS file_size_bytes, s.name AS song_name, s.singers FROM file_locations AS fl JOIN duplicate_pairs AS dp ON dp.file_asset_id = fl.file_asset_id AND dp.backend_id = fl.backend_id JOIN file_assets AS fa ON fa.id = fl.file_asset_id JOIN songs AS s ON s.id = fa.song_id JOIN storage_backends AS sb ON sb.id = fl.backend_id WHERE fl.status = 'active' ORDER BY fl.file_asset_id ASC, fl.backend_id ASC, fl.id ASC """ ).fetchall() grouped: dict[tuple[int, int], list[dict[str, Any]]] = {} for row in rows: location = _location_payload(row) key = (int(location["file_asset_id"]), int(location["backend_id"])) grouped.setdefault(key, []).append(location) groups: list[dict[str, Any]] = [] for (file_asset_id, backend_id), locations in grouped.items(): ordered_locations = sorted(locations, key=_location_sort_key) keep = ordered_locations[0] groups.append( { "file_asset_id": int(file_asset_id), "backend_id": int(backend_id), "backend_name": keep["backend_name"], "song_id": int(keep["song_id"]), "song_name": keep["song_name"], "singers": keep["singers"], "keep": keep, "duplicates": ordered_locations[1:], } ) groups.sort( key=lambda group: ( int(group["song_id"]), int(group["file_asset_id"]), int(group["backend_id"]), ) ) return groups def _build_scan_payload( self, groups: list[dict[str, Any]], *, scanned_active_local_location_count: int, sample_limit: int, ) -> dict[str, Any]: normalized_sample_limit = max(int(sample_limit or 20), 1) return { "summary": { "duplicate_group_count": len(groups), "duplicate_location_count": sum(len(group["duplicates"]) for group in groups), "duplicate_file_size_bytes": sum( _duplicate_size_bytes(location) for group in groups for location in group["duplicates"] ), "scanned_active_local_location_count": int(scanned_active_local_location_count), }, "groups": [self._serialize_group(group) for group in groups[:normalized_sample_limit]], } @staticmethod def _serialize_group(group: dict[str, Any]) -> dict[str, Any]: return { "file_asset_id": int(group["file_asset_id"]), "backend_id": int(group["backend_id"]), "backend_name": str(group["backend_name"]), "song_id": int(group["song_id"]), "song_name": str(group["song_name"]), "singers": str(group["singers"]), "keep": LocalMaintenanceService._serialize_location(group["keep"]), "duplicates": [ LocalMaintenanceService._serialize_location(location) for location in group["duplicates"] ], } @staticmethod def _serialize_location(location: dict[str, Any]) -> dict[str, Any]: return { "id": int(location["id"]), "locator": str(location["locator"]), "absolute_path": str(location["absolute_path"]), "file_exists": bool(location["file_exists"]), "file_size_bytes": _coerce_int(location["file_size_bytes"]), "actual_file_size_bytes": _coerce_int(location["actual_file_size_bytes"]), } @staticmethod def _refresh_song_backend_presence_with_connection( conn: sqlite3.Connection, *, song_id: int, backend_id: int, ) -> None: summary = conn.execute( """ SELECT COUNT(*) AS active_file_count, MIN(fl.id) AS primary_file_location_id FROM file_locations AS fl JOIN file_assets AS fa ON fa.id = fl.file_asset_id WHERE fa.song_id = ? AND fl.backend_id = ? AND fl.status = 'active' """, (int(song_id), int(backend_id)), ).fetchone() active_file_count = int(summary["active_file_count"]) if summary else 0 has_active_file = 1 if active_file_count > 0 else 0 primary_file_location_id = summary["primary_file_location_id"] if summary else None conn.execute( """ INSERT INTO song_backend_presence ( song_id, backend_id, has_active_file, active_file_count, primary_file_location_id ) VALUES (?, ?, ?, ?, ?) ON CONFLICT(song_id, backend_id) DO UPDATE SET has_active_file = excluded.has_active_file, active_file_count = excluded.active_file_count, primary_file_location_id = excluded.primary_file_location_id, updated_at = CURRENT_TIMESTAMP """, ( int(song_id), int(backend_id), has_active_file, active_file_count, primary_file_location_id, ), )