Initial import: Music_Server, MusicFree, catalog-sync
This commit is contained in:
@@ -0,0 +1,402 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sqlite3
|
||||
from contextlib import contextmanager, suppress
|
||||
from pathlib import Path, PurePath
|
||||
from typing import Any
|
||||
|
||||
from musicdl.catalogsync.db import connect_database
|
||||
|
||||
|
||||
# Matches a " (N)" marker (space, parenthesised digits) that sits either at the
# very end of a name or immediately before a single trailing ".ext" suffix,
# e.g. "song (1).mp3" or "song (2)".
_COPY_SUFFIX_RE = re.compile(r" \(\d+\)(?=(\.[^.]+)?$)")
|
||||
|
||||
|
||||
class LocalDedupeBlockedError(RuntimeError):
    """Signal that local dedupe was refused because jobs or job items are running."""
|
||||
|
||||
|
||||
def _coerce_int(value: Any) -> int | None:
|
||||
try:
|
||||
return int(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _row_value(row: sqlite3.Row | dict[str, Any], key: str) -> Any:
|
||||
if isinstance(row, sqlite3.Row):
|
||||
try:
|
||||
return row[key]
|
||||
except IndexError:
|
||||
return None
|
||||
return row.get(key)
|
||||
|
||||
|
||||
def _path_for_location(row: sqlite3.Row | dict[str, Any]) -> Path | None:
    """Build the filesystem path for a location row.

    A non-blank ``absolute_path`` wins. Otherwise ``base_path`` and ``locator``
    are joined; if either of those is blank the result is ``None``.
    """
    explicit = str(_row_value(row, "absolute_path") or "").strip()
    if explicit:
        return Path(explicit)

    root = str(_row_value(row, "base_path") or "").strip()
    relative = str(_row_value(row, "locator") or "").strip()
    if root and relative:
        return Path(root) / relative
    return None
|
||||
|
||||
|
||||
def _resolved_path(path: Path | None) -> Path | None:
|
||||
if path is None:
|
||||
return None
|
||||
with suppress(OSError, RuntimeError):
|
||||
return path.resolve(strict=False)
|
||||
return path
|
||||
|
||||
|
||||
def _paths_match(left: Path | None, right: Path | None) -> bool:
    """Tell whether two paths resolve to the same location.

    A missing path on either side never matches.
    """
    if left is not None and right is not None:
        return _resolved_path(left) == _resolved_path(right)
    return False
|
||||
|
||||
|
||||
def _has_copy_suffix(locator: str | None) -> bool:
    """Report whether the final path component of *locator* carries a " (N)" marker."""
    file_name = PurePath(str(locator or "")).name
    return _COPY_SUFFIX_RE.search(file_name) is not None
|
||||
|
||||
|
||||
def _location_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
    """Turn a joined location row into a plain dict enriched with on-disk facts.

    Besides the row's own columns, the result records whether the file exists
    right now, its current size on disk (``None`` if missing or unreadable),
    and the computed ``Path`` under the private ``"_path"`` key.
    """
    path = _path_for_location(row)
    file_exists = path is not None and path.exists()

    actual_file_size_bytes = None
    if file_exists:
        try:
            actual_file_size_bytes = int(path.stat().st_size)
        except OSError:
            # The file can vanish or become unreadable between exists() and
            # stat(); leave the on-disk size unknown in that case.
            pass

    payload: dict[str, Any] = {}
    payload["id"] = int(row["location_id"])
    payload["file_asset_id"] = int(row["file_asset_id"])
    payload["song_id"] = int(row["song_id"])
    payload["backend_id"] = int(row["backend_id"])
    payload["backend_name"] = str(row["backend_name"] or "")
    payload["locator"] = str(row["locator"] or "")
    payload["absolute_path"] = str(row["absolute_path"] or "")
    payload["file_exists"] = file_exists
    payload["file_size_bytes"] = _coerce_int(row["file_size_bytes"])
    payload["actual_file_size_bytes"] = actual_file_size_bytes
    payload["song_name"] = str(row["song_name"] or "")
    payload["singers"] = str(row["singers"] or "")
    payload["_path"] = path
    return payload
|
||||
|
||||
|
||||
def _location_sort_key(location: dict[str, Any]) -> tuple[int, int, int, int]:
    """Rank a location so the best candidate to keep sorts first.

    Preference order: file present on disk, no " (N)" copy marker in the
    locator, shorter locator, lower location id.
    """
    locator = location["locator"]
    missing_rank = int(not location["file_exists"])
    copy_rank = int(_has_copy_suffix(locator))
    return (missing_rank, copy_rank, len(locator), int(location["id"]))
|
||||
|
||||
|
||||
def _duplicate_size_bytes(location: dict[str, Any]) -> int:
|
||||
size_value = location.get("actual_file_size_bytes")
|
||||
if size_value is None:
|
||||
size_value = location.get("file_size_bytes")
|
||||
return max(int(size_value or 0), 0)
|
||||
|
||||
|
||||
class LocalMaintenanceService:
    """Scan for and remove duplicate active locations on ``local_fs`` backends.

    A "duplicate group" is a set of more than one active ``file_locations``
    row sharing the same ``(file_asset_id, backend_id)`` on a backend whose
    ``backend_type`` is ``'local_fs'``. One location per group is kept; the
    others are the duplicates.

    NOTE(review): rows are read by column name, so ``connect_database`` is
    presumably configured with ``sqlite3.Row`` as row factory -- confirm in
    ``musicdl.catalogsync.db``.
    """

    def __init__(self, db_path: str | Path):
        """Remember the database location; no connection is opened here."""
        self.db_path = Path(db_path)

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection to the configured database."""
        return connect_database(self.db_path)

    @contextmanager
    def _connection(self):
        """Yield a fresh connection, committing on success and always closing.

        When the managed block raises, ``commit()`` is skipped and the
        connection is simply closed.
        """
        conn = self._connect()
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    def scan_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Report duplicate groups without modifying anything.

        Args:
            sample_limit: Maximum number of groups to include in the
                ``"groups"`` list of the result (summary counts always cover
                every group).

        Returns:
            A dict with a ``"summary"`` of counts and a ``"groups"`` sample,
            as produced by ``_build_scan_payload``.
        """
        with self._connection() as conn:
            groups = self._load_duplicate_groups(conn)
            scanned_row = conn.execute(
                """
                SELECT COUNT(*) AS count_value
                FROM file_locations AS fl
                JOIN storage_backends AS sb ON sb.id = fl.backend_id
                WHERE fl.status = 'active'
                AND sb.backend_type = 'local_fs'
                """
            ).fetchone()
            scanned_count = int(scanned_row["count_value"]) if scanned_row else 0
        return self._build_scan_payload(
            groups,
            scanned_active_local_location_count=scanned_count,
            sample_limit=sample_limit,
        )

    def dedupe_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Deactivate duplicate locations, delete their files, and re-scan.

        For every duplicate group the kept location becomes the primary one,
        upload tasks and job items pointing at a duplicate are repointed to
        the kept location, and each duplicate is marked inactive. The
        song/backend presence rows of the affected pairs are then refreshed.

        Files are deleted only after the database transaction has committed,
        so a failure in any database step cannot leave active locations whose
        files have already been removed. Deletion itself is best-effort: a
        file that cannot be removed is skipped and not counted.

        Args:
            sample_limit: Forwarded to the follow-up ``scan_local_duplicates``.

        Returns:
            The post-dedupe scan payload with an additional ``"execution"``
            dict of counters describing what was changed.

        Raises:
            LocalDedupeBlockedError: If any job run or job item is running.
        """
        execution: dict[str, int] = {
            "deduped_group_count": 0,
            "inactive_location_count": 0,
            "deleted_file_count": 0,
            "released_bytes": 0,
            "repointed_upload_task_count": 0,
            "repointed_job_item_count": 0,
        }
        # (duplicate payload, path of the kept file) pairs to unlink post-commit.
        pending_deletions: list[tuple[dict[str, Any], Path | None]] = []

        with self._connection() as conn:
            self._raise_if_running_work(conn)
            affected_pairs: set[tuple[int, int]] = set()
            for group in self._load_duplicate_groups(conn):
                duplicates = list(group["duplicates"])
                if not duplicates:
                    continue
                keep = group["keep"]
                keep_id = int(keep["id"])
                execution["deduped_group_count"] += 1
                # Make the kept location the only primary one for this
                # (file asset, backend) pair.
                conn.execute(
                    """
                    UPDATE file_locations
                    SET
                    is_primary = CASE WHEN id = ? THEN 1 ELSE 0 END,
                    updated_at = CURRENT_TIMESTAMP
                    WHERE file_asset_id = ? AND backend_id = ?
                    """,
                    (
                        keep_id,
                        int(group["file_asset_id"]),
                        int(group["backend_id"]),
                    ),
                )
                for duplicate in duplicates:
                    self._retire_duplicate(
                        conn,
                        keep_id=keep_id,
                        duplicate_id=int(duplicate["id"]),
                        execution=execution,
                    )
                    if duplicate["_path"] is not None:
                        pending_deletions.append((duplicate, keep["_path"]))
                affected_pairs.add((int(group["song_id"]), int(group["backend_id"])))

            for song_id, backend_id in affected_pairs:
                self._refresh_song_backend_presence_with_connection(
                    conn,
                    song_id=song_id,
                    backend_id=backend_id,
                )

        # The transaction is committed at this point; only now touch the disk.
        for duplicate, keep_path in pending_deletions:
            duplicate_path = duplicate["_path"]
            # Never delete the file that the kept location itself points to.
            if not duplicate_path.exists() or _paths_match(duplicate_path, keep_path):
                continue
            try:
                duplicate_path.unlink()
            except OSError:
                continue
            execution["deleted_file_count"] += 1
            execution["released_bytes"] += _duplicate_size_bytes(duplicate)

        payload = self.scan_local_duplicates(sample_limit=sample_limit)
        payload["execution"] = execution
        return payload

    @staticmethod
    def _retire_duplicate(
        conn: sqlite3.Connection,
        *,
        keep_id: int,
        duplicate_id: int,
        execution: dict[str, int],
    ) -> None:
        """Repoint references from one duplicate to the kept row and deactivate it.

        Updates the matching counters in *execution* in place.
        """
        upload_cursor = conn.execute(
            """
            UPDATE upload_tasks
            SET
            source_location_id = ?,
            updated_at = CURRENT_TIMESTAMP
            WHERE source_location_id = ?
            """,
            (keep_id, duplicate_id),
        )
        execution["repointed_upload_task_count"] += max(upload_cursor.rowcount, 0)

        item_cursor = conn.execute(
            """
            UPDATE job_items
            SET file_location_id = ?
            WHERE file_location_id = ?
            """,
            (keep_id, duplicate_id),
        )
        execution["repointed_job_item_count"] += max(item_cursor.rowcount, 0)

        inactive_cursor = conn.execute(
            """
            UPDATE file_locations
            SET
            status = 'inactive',
            is_primary = 0,
            updated_at = CURRENT_TIMESTAMP
            WHERE id = ? AND status = 'active'
            """,
            (duplicate_id,),
        )
        execution["inactive_location_count"] += max(inactive_cursor.rowcount, 0)

    def _raise_if_running_work(self, conn: sqlite3.Connection) -> None:
        """Refuse to proceed while any job run or job item has status 'running'.

        Raises:
            LocalDedupeBlockedError: If at least one running row is found in
                ``job_runs`` or ``job_items``.
        """
        running_jobs_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_runs WHERE status = 'running'"
        ).fetchone()
        running_items_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_items WHERE status = 'running'"
        ).fetchone()
        running_jobs = int(running_jobs_row["count_value"]) if running_jobs_row else 0
        running_items = int(running_items_row["count_value"]) if running_items_row else 0
        if running_jobs > 0 or running_items > 0:
            raise LocalDedupeBlockedError(
                f"cannot dedupe while jobs or items are running (jobs={running_jobs}, items={running_items})"
            )

    def _load_duplicate_groups(self, conn: sqlite3.Connection) -> list[dict[str, Any]]:
        """Load every duplicate group and pick the location to keep in each.

        Returns:
            One dict per ``(file_asset_id, backend_id)`` pair that has more
            than one active local location. Each carries the chosen ``"keep"``
            location and the remaining ``"duplicates"``, both as produced by
            ``_location_payload``. Groups are ordered by song id, file asset
            id, then backend id.
        """
        rows = conn.execute(
            """
            WITH duplicate_pairs AS (
            SELECT fl.file_asset_id, fl.backend_id
            FROM file_locations AS fl
            JOIN storage_backends AS sb ON sb.id = fl.backend_id
            WHERE fl.status = 'active'
            AND sb.backend_type = 'local_fs'
            GROUP BY fl.file_asset_id, fl.backend_id
            HAVING COUNT(*) > 1
            )
            SELECT
            fl.id AS location_id,
            fl.file_asset_id,
            fa.song_id,
            fl.backend_id,
            sb.name AS backend_name,
            sb.base_path,
            fl.locator,
            fl.absolute_path,
            COALESCE(fa.file_size_bytes, s.file_size_bytes) AS file_size_bytes,
            s.name AS song_name,
            s.singers
            FROM file_locations AS fl
            JOIN duplicate_pairs AS dp
            ON dp.file_asset_id = fl.file_asset_id
            AND dp.backend_id = fl.backend_id
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            JOIN songs AS s ON s.id = fa.song_id
            JOIN storage_backends AS sb ON sb.id = fl.backend_id
            WHERE fl.status = 'active'
            ORDER BY fl.file_asset_id ASC, fl.backend_id ASC, fl.id ASC
            """
        ).fetchall()

        grouped: dict[tuple[int, int], list[dict[str, Any]]] = {}
        for row in rows:
            location = _location_payload(row)
            key = (int(location["file_asset_id"]), int(location["backend_id"]))
            grouped.setdefault(key, []).append(location)

        groups: list[dict[str, Any]] = []
        for (file_asset_id, backend_id), locations in grouped.items():
            # The best-ranked location is kept; all others are duplicates.
            keep, *duplicates = sorted(locations, key=_location_sort_key)
            groups.append(
                {
                    "file_asset_id": int(file_asset_id),
                    "backend_id": int(backend_id),
                    "backend_name": keep["backend_name"],
                    "song_id": int(keep["song_id"]),
                    "song_name": keep["song_name"],
                    "singers": keep["singers"],
                    "keep": keep,
                    "duplicates": duplicates,
                }
            )
        groups.sort(
            key=lambda group: (
                int(group["song_id"]),
                int(group["file_asset_id"]),
                int(group["backend_id"]),
            )
        )
        return groups

    def _build_scan_payload(
        self,
        groups: list[dict[str, Any]],
        *,
        scanned_active_local_location_count: int,
        sample_limit: int,
    ) -> dict[str, Any]:
        """Assemble the scan result from already-loaded groups.

        Args:
            groups: Duplicate groups as returned by ``_load_duplicate_groups``.
            scanned_active_local_location_count: Total active local locations
                examined, echoed into the summary.
            sample_limit: Cap on serialized groups. A falsy value is treated
                as 20 and the effective limit is never below 1.

        Returns:
            ``{"summary": {...}, "groups": [...]}`` where the summary counts
            cover all groups and ``"groups"`` holds at most the limited sample.
        """
        normalized_sample_limit = max(int(sample_limit or 20), 1)
        return {
            "summary": {
                "duplicate_group_count": len(groups),
                "duplicate_location_count": sum(len(group["duplicates"]) for group in groups),
                "duplicate_file_size_bytes": sum(
                    _duplicate_size_bytes(location)
                    for group in groups
                    for location in group["duplicates"]
                ),
                "scanned_active_local_location_count": int(scanned_active_local_location_count),
            },
            "groups": [self._serialize_group(group) for group in groups[:normalized_sample_limit]],
        }

    @staticmethod
    def _serialize_group(group: dict[str, Any]) -> dict[str, Any]:
        """Convert an internal group dict to its public, plain-typed form."""
        return {
            "file_asset_id": int(group["file_asset_id"]),
            "backend_id": int(group["backend_id"]),
            "backend_name": str(group["backend_name"]),
            "song_id": int(group["song_id"]),
            "song_name": str(group["song_name"]),
            "singers": str(group["singers"]),
            "keep": LocalMaintenanceService._serialize_location(group["keep"]),
            "duplicates": [
                LocalMaintenanceService._serialize_location(location)
                for location in group["duplicates"]
            ],
        }

    @staticmethod
    def _serialize_location(location: dict[str, Any]) -> dict[str, Any]:
        """Convert an internal location dict to its public form.

        Only the id, locator, absolute path, existence flag and the two size
        fields are exposed; internal keys such as ``"_path"`` are omitted.
        """
        return {
            "id": int(location["id"]),
            "locator": str(location["locator"]),
            "absolute_path": str(location["absolute_path"]),
            "file_exists": bool(location["file_exists"]),
            "file_size_bytes": _coerce_int(location["file_size_bytes"]),
            "actual_file_size_bytes": _coerce_int(location["actual_file_size_bytes"]),
        }

    @staticmethod
    def _refresh_song_backend_presence_with_connection(
        conn: sqlite3.Connection,
        *,
        song_id: int,
        backend_id: int,
    ) -> None:
        """Recompute and upsert the ``song_backend_presence`` row for one pair.

        Counts the active locations of the song on the backend and stores that
        count, a 0/1 "has active file" flag, and the lowest active location id
        as ``primary_file_location_id`` (``NULL`` when there is none).
        """
        summary = conn.execute(
            """
            SELECT
            COUNT(*) AS active_file_count,
            MIN(fl.id) AS primary_file_location_id
            FROM file_locations AS fl
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            WHERE fa.song_id = ?
            AND fl.backend_id = ?
            AND fl.status = 'active'
            """,
            (int(song_id), int(backend_id)),
        ).fetchone()
        active_file_count = int(summary["active_file_count"]) if summary else 0
        has_active_file = 1 if active_file_count > 0 else 0
        primary_file_location_id = summary["primary_file_location_id"] if summary else None
        conn.execute(
            """
            INSERT INTO song_backend_presence (
            song_id,
            backend_id,
            has_active_file,
            active_file_count,
            primary_file_location_id
            )
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(song_id, backend_id) DO UPDATE SET
            has_active_file = excluded.has_active_file,
            active_file_count = excluded.active_file_count,
            primary_file_location_id = excluded.primary_file_location_id,
            updated_at = CURRENT_TIMESTAMP
            """,
            (
                int(song_id),
                int(backend_id),
                has_active_file,
                active_file_count,
                primary_file_location_id,
            ),
        )
|
||||
Reference in New Issue
Block a user