403 lines
15 KiB
Python
403 lines
15 KiB
Python
from __future__ import annotations
|
|
|
|
import re
|
|
import sqlite3
|
|
from contextlib import contextmanager, suppress
|
|
from pathlib import Path, PurePath
|
|
from typing import Any
|
|
|
|
from musicdl.catalogsync.db import connect_database
|
|
|
|
|
|
# Matches a " (<digits>)" marker that sits either at the very end of a name or
# immediately before a single trailing ".ext" suffix, e.g. "song (1).mp3".
_COPY_SUFFIX_RE = re.compile(r" \(\d+\)(?=(\.[^.]+)?$)")
|
|
|
|
|
|
class LocalDedupeBlockedError(RuntimeError):
    """Raised when deduplication is refused because jobs or job items are running."""
|
|
|
|
|
|
def _coerce_int(value: Any) -> int | None:
    """Convert *value* to ``int``, or return ``None`` when that is not possible.

    Args:
        value: Anything ``int()`` might accept (numbers, numeric strings, ...).

    Returns:
        The integer value, or ``None`` if ``int(value)`` raises ``TypeError``,
        ``ValueError`` or ``OverflowError``.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError is what int() raises for float infinities; without it
        # this "safe" coercion would still blow up on such a value.
        return None
|
|
|
|
|
|
def _row_value(row: sqlite3.Row | dict[str, Any], key: str) -> Any:
    """Look up *key* in a row, yielding ``None`` when the column is absent.

    Plain mappings are read with ``.get``; ``sqlite3.Row`` objects are indexed
    by name and an ``IndexError`` from that lookup is turned into ``None``.
    """
    if not isinstance(row, sqlite3.Row):
        return row.get(key)
    try:
        found = row[key]
    except IndexError:
        return None
    else:
        return found
|
|
|
|
|
|
def _path_for_location(row: sqlite3.Row | dict[str, Any]) -> Path | None:
    """Build the filesystem path described by a location row.

    A non-blank ``absolute_path`` wins. Otherwise the path is ``base_path``
    joined with ``locator``; if either of those is blank, ``None`` is returned.
    """

    def _text(column: str) -> str:
        # Missing / NULL columns collapse to "", and whitespace is trimmed.
        return str(_row_value(row, column) or "").strip()

    explicit = _text("absolute_path")
    if explicit:
        return Path(explicit)

    root = _text("base_path")
    relative = _text("locator")
    if root and relative:
        return Path(root) / relative
    return None
|
|
|
|
|
|
def _resolved_path(path: Path | None) -> Path | None:
    """Return ``path.resolve(strict=False)``, falling back to *path* itself.

    ``None`` passes straight through. If resolving raises ``OSError`` or
    ``RuntimeError`` the unresolved path is returned instead.
    """
    if path is None:
        return None
    try:
        return path.resolve(strict=False)
    except (OSError, RuntimeError):
        return path
|
|
|
|
|
|
def _paths_match(left: Path | None, right: Path | None) -> bool:
    """Tell whether two paths are equal after resolution.

    A missing (``None``) path on either side never matches.
    """
    if left is not None and right is not None:
        return _resolved_path(left) == _resolved_path(right)
    return False
|
|
|
|
|
|
def _has_copy_suffix(locator: str | None) -> bool:
    """Report whether the final component of *locator* carries a " (N)" marker.

    Only the last path component is inspected; ``None`` is treated as "".
    """
    file_name = PurePath(str(locator or "")).name
    return _COPY_SUFFIX_RE.search(file_name) is not None
|
|
|
|
|
|
def _location_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
    """Turn one location row into a plain dict enriched with on-disk facts.

    Besides the normalised row columns, the result carries ``file_exists``,
    ``actual_file_size_bytes`` (from ``stat``; ``None`` when the file is
    missing or cannot be stat-ed) and the resolved ``_path`` object.
    """
    location_path = _path_for_location(row)
    exists_on_disk = location_path is not None and location_path.exists()

    size_on_disk = None
    if exists_on_disk:
        try:
            size_on_disk = int(location_path.stat().st_size)
        except OSError:
            # Best effort only: the size simply stays unknown.
            pass

    return {
        "id": int(row["location_id"]),
        "file_asset_id": int(row["file_asset_id"]),
        "song_id": int(row["song_id"]),
        "backend_id": int(row["backend_id"]),
        "backend_name": str(row["backend_name"] or ""),
        "locator": str(row["locator"] or ""),
        "absolute_path": str(row["absolute_path"] or ""),
        "file_exists": exists_on_disk,
        "file_size_bytes": _coerce_int(row["file_size_bytes"]),
        "actual_file_size_bytes": size_on_disk,
        "song_name": str(row["song_name"] or ""),
        "singers": str(row["singers"] or ""),
        "_path": location_path,
    }
|
|
|
|
|
|
def _location_sort_key(location: dict[str, Any]) -> tuple[int, int, int, int]:
    """Rank a location so the best candidate to keep sorts first.

    Order of preference: file present on disk, no " (N)" copy marker in the
    locator, shorter locator, then lower location id.
    """
    missing_rank = 0 if location["file_exists"] else 1
    locator = location["locator"]
    copy_rank = 1 if _has_copy_suffix(locator) else 0
    return missing_rank, copy_rank, len(locator), int(location["id"])
|
|
|
|
|
|
def _duplicate_size_bytes(location: dict[str, Any]) -> int:
    """Return the non-negative size attributed to a location.

    ``actual_file_size_bytes`` is used whenever it is not ``None`` (even if it
    is 0); otherwise ``file_size_bytes`` is used. Missing or falsy values count
    as 0 and negative values are clamped to 0.
    """
    measured = location.get("actual_file_size_bytes")
    chosen = location.get("file_size_bytes") if measured is None else measured
    size = int(chosen or 0)
    return size if size > 0 else 0
|
|
|
|
|
|
class LocalMaintenanceService:
    """Find and clean up duplicate active file locations on local backends.

    A duplicate group is a ``(file_asset_id, backend_id)`` pair that has more
    than one ``file_locations`` row with ``status = 'active'`` on a backend
    whose ``backend_type`` is ``'local_fs'``.
    """

    def __init__(self, db_path: str | Path):
        """Remember the database path; no connection is opened here."""
        self.db_path = Path(db_path)

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection to ``self.db_path`` via ``connect_database``."""
        # NOTE(review): rows are indexed by column name throughout this class,
        # so connect_database presumably installs sqlite3.Row - confirm.
        return connect_database(self.db_path)

    @contextmanager
    def _connection(self):
        """Yield a fresh connection; commit on success and always close it.

        When the managed block raises, ``commit()`` is skipped and the
        connection is closed with its pending work left uncommitted.
        """
        conn = self._connect()
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    def scan_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Report duplicate groups without modifying anything.

        Args:
            sample_limit: Maximum number of groups serialised under ``groups``
                (the summary always covers every group).

        Returns:
            A dict with a ``summary`` section and a ``groups`` sample.
        """
        with self._connection() as conn:
            groups = self._load_duplicate_groups(conn)
            scanned_row = conn.execute(
                """
                SELECT COUNT(*) AS count_value
                FROM file_locations AS fl
                JOIN storage_backends AS sb ON sb.id = fl.backend_id
                WHERE fl.status = 'active'
                  AND sb.backend_type = 'local_fs'
                """
            ).fetchone()
        return self._build_scan_payload(
            groups,
            scanned_active_local_location_count=int(scanned_row["count_value"]) if scanned_row else 0,
            sample_limit=sample_limit,
        )

    def dedupe_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Collapse every duplicate group down to its single kept location.

        For each group the kept row becomes primary, references held by
        ``upload_tasks`` and ``job_items`` are repointed to it, and the other
        rows are marked inactive. Files of retired rows are removed only after
        the database transaction has committed, so a failure part-way through
        the database work never leaves files deleted for rows that are still
        active.

        Args:
            sample_limit: Forwarded to the follow-up scan.

        Returns:
            The payload of a fresh scan plus an ``execution`` dict of counters.

        Raises:
            LocalDedupeBlockedError: If any job run or job item is running.
        """
        execution = {
            "deduped_group_count": 0,
            "inactive_location_count": 0,
            "deleted_file_count": 0,
            "released_bytes": 0,
            "repointed_upload_task_count": 0,
            "repointed_job_item_count": 0,
        }
        # (path, size) pairs whose files are removed once the commit succeeded.
        pending_deletes: list[tuple[Path, int]] = []

        with self._connection() as conn:
            self._raise_if_running_work(conn)
            groups = self._load_duplicate_groups(conn)
            affected_pairs: set[tuple[int, int]] = set()

            for group in groups:
                keep = group["keep"]
                duplicates = list(group["duplicates"])
                if not duplicates:
                    continue
                execution["deduped_group_count"] += 1
                keep_id = int(keep["id"])

                # Exactly one primary row per (asset, backend): the kept one.
                conn.execute(
                    """
                    UPDATE file_locations
                    SET
                        is_primary = CASE WHEN id = ? THEN 1 ELSE 0 END,
                        updated_at = CURRENT_TIMESTAMP
                    WHERE file_asset_id = ? AND backend_id = ?
                    """,
                    (
                        keep_id,
                        int(group["file_asset_id"]),
                        int(group["backend_id"]),
                    ),
                )

                for duplicate in duplicates:
                    self._retire_duplicate(
                        conn,
                        keep_id=keep_id,
                        duplicate_id=int(duplicate["id"]),
                        execution=execution,
                    )
                    duplicate_path = duplicate["_path"]
                    if (
                        duplicate_path is not None
                        and duplicate_path.exists()
                        # Never schedule the kept file itself for deletion.
                        and not _paths_match(duplicate_path, keep["_path"])
                    ):
                        pending_deletes.append(
                            (duplicate_path, _duplicate_size_bytes(duplicate))
                        )

                affected_pairs.add((int(group["song_id"]), int(group["backend_id"])))

            for song_id, backend_id in affected_pairs:
                self._refresh_song_backend_presence_with_connection(
                    conn,
                    song_id=song_id,
                    backend_id=backend_id,
                )

        # The transaction is committed at this point; touching disk is now safe.
        self._delete_retired_files(pending_deletes, execution)

        payload = self.scan_local_duplicates(sample_limit=sample_limit)
        payload["execution"] = execution
        return payload

    @staticmethod
    def _retire_duplicate(
        conn: sqlite3.Connection,
        *,
        keep_id: int,
        duplicate_id: int,
        execution: dict[str, int],
    ) -> None:
        """Repoint references from one duplicate row to the kept row and deactivate it."""
        upload_cursor = conn.execute(
            """
            UPDATE upload_tasks
            SET
                source_location_id = ?,
                updated_at = CURRENT_TIMESTAMP
            WHERE source_location_id = ?
            """,
            (keep_id, duplicate_id),
        )
        # rowcount can be negative when unknown; never let it reduce a counter.
        execution["repointed_upload_task_count"] += max(upload_cursor.rowcount, 0)

        item_cursor = conn.execute(
            """
            UPDATE job_items
            SET file_location_id = ?
            WHERE file_location_id = ?
            """,
            (keep_id, duplicate_id),
        )
        execution["repointed_job_item_count"] += max(item_cursor.rowcount, 0)

        inactive_cursor = conn.execute(
            """
            UPDATE file_locations
            SET
                status = 'inactive',
                is_primary = 0,
                updated_at = CURRENT_TIMESTAMP
            WHERE id = ? AND status = 'active'
            """,
            (duplicate_id,),
        )
        execution["inactive_location_count"] += max(inactive_cursor.rowcount, 0)

    @staticmethod
    def _delete_retired_files(
        pending_deletes: list[tuple[Path, int]],
        execution: dict[str, int],
    ) -> None:
        """Unlink each scheduled file, counting only the deletions that succeed."""
        for file_path, size_bytes in pending_deletes:
            # Best effort: a file that is already gone (e.g. two rows sharing a
            # path) or cannot be removed is skipped and not counted.
            with suppress(OSError):
                file_path.unlink()
                execution["deleted_file_count"] += 1
                execution["released_bytes"] += size_bytes

    def _raise_if_running_work(self, conn: sqlite3.Connection) -> None:
        """Refuse to continue while any job run or job item has status 'running'.

        Raises:
            LocalDedupeBlockedError: If at least one running row is found.
        """
        running_jobs_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_runs WHERE status = 'running'"
        ).fetchone()
        running_items_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_items WHERE status = 'running'"
        ).fetchone()
        running_jobs = int(running_jobs_row["count_value"]) if running_jobs_row else 0
        running_items = int(running_items_row["count_value"]) if running_items_row else 0
        if running_jobs > 0 or running_items > 0:
            raise LocalDedupeBlockedError(
                f"cannot dedupe while jobs or items are running (jobs={running_jobs}, items={running_items})"
            )

    def _load_duplicate_groups(self, conn: sqlite3.Connection) -> list[dict[str, Any]]:
        """Load all duplicate groups and pick the location to keep in each.

        Returns:
            One dict per ``(file_asset_id, backend_id)`` group holding the
            ``keep`` location (first by ``_location_sort_key``) and the
            remaining ``duplicates``, sorted by song, asset and backend id.
        """
        rows = conn.execute(
            """
            WITH duplicate_pairs AS (
                SELECT fl.file_asset_id, fl.backend_id
                FROM file_locations AS fl
                JOIN storage_backends AS sb ON sb.id = fl.backend_id
                WHERE fl.status = 'active'
                  AND sb.backend_type = 'local_fs'
                GROUP BY fl.file_asset_id, fl.backend_id
                HAVING COUNT(*) > 1
            )
            SELECT
                fl.id AS location_id,
                fl.file_asset_id,
                fa.song_id,
                fl.backend_id,
                sb.name AS backend_name,
                sb.base_path,
                fl.locator,
                fl.absolute_path,
                COALESCE(fa.file_size_bytes, s.file_size_bytes) AS file_size_bytes,
                s.name AS song_name,
                s.singers
            FROM file_locations AS fl
            JOIN duplicate_pairs AS dp
                ON dp.file_asset_id = fl.file_asset_id
               AND dp.backend_id = fl.backend_id
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            JOIN songs AS s ON s.id = fa.song_id
            JOIN storage_backends AS sb ON sb.id = fl.backend_id
            WHERE fl.status = 'active'
            ORDER BY fl.file_asset_id ASC, fl.backend_id ASC, fl.id ASC
            """
        ).fetchall()

        grouped: dict[tuple[int, int], list[dict[str, Any]]] = {}
        for row in rows:
            location = _location_payload(row)
            key = (int(location["file_asset_id"]), int(location["backend_id"]))
            grouped.setdefault(key, []).append(location)

        groups: list[dict[str, Any]] = []
        for (file_asset_id, backend_id), locations in grouped.items():
            ordered_locations = sorted(locations, key=_location_sort_key)
            keep = ordered_locations[0]
            groups.append(
                {
                    "file_asset_id": int(file_asset_id),
                    "backend_id": int(backend_id),
                    "backend_name": keep["backend_name"],
                    "song_id": int(keep["song_id"]),
                    "song_name": keep["song_name"],
                    "singers": keep["singers"],
                    "keep": keep,
                    "duplicates": ordered_locations[1:],
                }
            )
        groups.sort(
            key=lambda group: (
                int(group["song_id"]),
                int(group["file_asset_id"]),
                int(group["backend_id"]),
            )
        )
        return groups

    def _build_scan_payload(
        self,
        groups: list[dict[str, Any]],
        *,
        scanned_active_local_location_count: int,
        sample_limit: int,
    ) -> dict[str, Any]:
        """Assemble the scan result: totals over all groups plus a bounded sample.

        A falsy ``sample_limit`` falls back to 20 and the limit is never
        allowed below 1.
        """
        normalized_sample_limit = max(int(sample_limit or 20), 1)
        return {
            "summary": {
                "duplicate_group_count": len(groups),
                "duplicate_location_count": sum(len(group["duplicates"]) for group in groups),
                "duplicate_file_size_bytes": sum(
                    _duplicate_size_bytes(location)
                    for group in groups
                    for location in group["duplicates"]
                ),
                "scanned_active_local_location_count": int(scanned_active_local_location_count),
            },
            "groups": [self._serialize_group(group) for group in groups[:normalized_sample_limit]],
        }

    @staticmethod
    def _serialize_group(group: dict[str, Any]) -> dict[str, Any]:
        """Convert an internal group dict into its plain, outward-facing form."""
        return {
            "file_asset_id": int(group["file_asset_id"]),
            "backend_id": int(group["backend_id"]),
            "backend_name": str(group["backend_name"]),
            "song_id": int(group["song_id"]),
            "song_name": str(group["song_name"]),
            "singers": str(group["singers"]),
            "keep": LocalMaintenanceService._serialize_location(group["keep"]),
            "duplicates": [
                LocalMaintenanceService._serialize_location(location)
                for location in group["duplicates"]
            ],
        }

    @staticmethod
    def _serialize_location(location: dict[str, Any]) -> dict[str, Any]:
        """Convert an internal location dict into its outward-facing form.

        The internal ``_path`` object and the song/backend columns are dropped.
        """
        return {
            "id": int(location["id"]),
            "locator": str(location["locator"]),
            "absolute_path": str(location["absolute_path"]),
            "file_exists": bool(location["file_exists"]),
            "file_size_bytes": _coerce_int(location["file_size_bytes"]),
            "actual_file_size_bytes": _coerce_int(location["actual_file_size_bytes"]),
        }

    @staticmethod
    def _refresh_song_backend_presence_with_connection(
        conn: sqlite3.Connection,
        *,
        song_id: int,
        backend_id: int,
    ) -> None:
        """Recompute and upsert the ``song_backend_presence`` row for one pair.

        The row records how many active locations the song has on the backend,
        whether there is at least one, and the lowest active location id
        (``NULL`` when there is none).
        """
        summary = conn.execute(
            """
            SELECT
                COUNT(*) AS active_file_count,
                MIN(fl.id) AS primary_file_location_id
            FROM file_locations AS fl
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            WHERE fa.song_id = ?
              AND fl.backend_id = ?
              AND fl.status = 'active'
            """,
            (int(song_id), int(backend_id)),
        ).fetchone()
        active_file_count = int(summary["active_file_count"]) if summary else 0
        has_active_file = 1 if active_file_count > 0 else 0
        primary_file_location_id = summary["primary_file_location_id"] if summary else None
        conn.execute(
            """
            INSERT INTO song_backend_presence (
                song_id,
                backend_id,
                has_active_file,
                active_file_count,
                primary_file_location_id
            )
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(song_id, backend_id) DO UPDATE SET
                has_active_file = excluded.has_active_file,
                active_file_count = excluded.active_file_count,
                primary_file_location_id = excluded.primary_file_location_id,
                updated_at = CURRENT_TIMESTAMP
            """,
            (
                int(song_id),
                int(backend_id),
                has_active_file,
                active_file_count,
                primary_file_location_id,
            ),
        )
|