Initial import: Music_Server, MusicFree, catalog-sync

This commit is contained in:
2026-05-23 16:51:14 +08:00
commit 069af30dba
847 changed files with 179878 additions and 0 deletions
@@ -0,0 +1,12 @@
from .models import ItemStatus, JobItem, JobRun, JobStatus, JobStage, StageStatus
from .repository import OpsRepository
__all__ = [
"ItemStatus",
"JobItem",
"JobRun",
"JobStatus",
"JobStage",
"OpsRepository",
"StageStatus",
]
@@ -0,0 +1,91 @@
from __future__ import annotations
import hashlib
from pathlib import Path
from typing import Any
from .repository import OpsRepository
def _parse_sources(value: str | None) -> list[str]:
if not value:
return []
return [item.strip() for item in value.split(",") if item and item.strip()]
def _normalize_env_value(raw_value: str) -> str:
stripped_value = raw_value.strip()
if (
len(stripped_value) >= 2
and stripped_value[0] == stripped_value[-1]
and stripped_value[0] in {"'", '"'}
):
return stripped_value[1:-1]
return raw_value
def _parse_env(content: str) -> dict[str, str]:
mapping: dict[str, str] = {}
for raw_line in content.splitlines():
line = raw_line.strip()
if not line or line.startswith("#"):
continue
normalized = raw_line.lstrip()
if normalized.startswith("export "):
normalized = normalized[len("export ") :]
if "=" not in normalized:
continue
key, value = normalized.split("=", 1)
key = key.strip()
if not key:
continue
mapping[key] = _normalize_env_value(value)
return mapping
class CatalogsyncEnvManager:
    """Reads the catalog-sync env file and versions its text via the ops repository."""

    def __init__(
        self,
        *,
        db_path: str | Path,
        env_file_path: str | Path,
        repository: OpsRepository | None = None,
    ):
        """Remember the env file location and the repository to store revisions in.

        Args:
            db_path: Database path; only used to build an ``OpsRepository``
                when *repository* is not supplied.
            env_file_path: Location of the env file being managed.
            repository: Repository to use instead of creating one.
        """
        self.env_file_path = Path(env_file_path)
        self.repository = repository or OpsRepository(db_path)

    def load_current(self) -> dict[str, str]:
        """Return the parsed env file, or an empty mapping when the file is absent."""
        env_path = self.env_file_path
        if env_path.exists():
            return _parse_env(env_path.read_text(encoding="utf-8"))
        return {}

    def build_job_snapshot(self) -> dict[str, Any]:
        """Return the current env values plus a parsed ``download_sources`` list."""
        env_values = self.load_current()
        return {
            **env_values,
            "download_sources": _parse_sources(env_values.get("DOWNLOAD_SOURCES")),
        }

    def save_revision(self, note: str | None = None, source_type: str = "env_file") -> int:
        """Store the env file's current text as a new config revision.

        A missing file is recorded as empty text. The SHA-256 hex digest of
        the UTF-8 encoded text is stored alongside it.

        Returns:
            Whatever ``create_config_revision`` returns (annotated as ``int``).
        """
        env_path = self.env_file_path
        text = env_path.read_text(encoding="utf-8") if env_path.exists() else ""
        digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
        return self.repository.create_config_revision(
            source_type=source_type,
            file_path=str(env_path.resolve()),
            content_text=text,
            content_hash=digest,
            note=note,
        )

    def list_revisions(self, limit: int = 50) -> list[dict[str, Any]]:
        """Return up to *limit* stored revisions as provided by the repository."""
        return self.repository.list_config_revisions(limit=limit)

    def apply_revision(self, revision_id: int) -> None:
        """Overwrite the env file with a stored revision and mark it applied.

        Raises:
            ValueError: If the repository has no revision with *revision_id*.
        """
        revision = self.repository.get_config_revision(revision_id)
        if revision is None:
            raise ValueError(f"config revision not found: {revision_id}")
        target = self.env_file_path
        # The parent directory may not exist yet on a fresh install.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_text(revision["content_text"], encoding="utf-8")
        self.repository.mark_config_revision_applied(revision_id)
@@ -0,0 +1,466 @@
from __future__ import annotations
import json
import threading
from dataclasses import dataclass
from pathlib import Path
from typing import Callable
from musicdl.catalogsync.downloader import CatalogDownloader
from musicdl.catalogsync.repository import CatalogRepository
from musicdl.catalogsync.services import CatalogSyncService
from musicdl.catalogsync.uploader import CatalogUploader
from .repository import OpsRepository
# Human-readable reason passed to mark_item_skipped(); the Chinese text reads
# roughly "non-music resource (audio chart entry)".
NON_MUSIC_RESOURCE_REASON = "非音乐资源(有声榜条目)"
# Machine-readable reason code passed to mark_item_skipped() with the message above.
NON_MUSIC_RESOURCE_CODE = "NON_MUSIC_RESOURCE"
@dataclass
class ResolvedStageDownloadTask:
    """Hand-off record queued by the resolve step and consumed by the download step."""

    # Ops job item the task belongs to.
    item_id: int
    # Value of the download row's "playlist_id" entry (may be absent).
    playlist_id: int | None
    # Download row as returned by OpsRepository.build_download_row().
    row: dict[str, object]
    # Opaque result of CatalogDownloader.resolve_song_row(); passed back to the downloader.
    resolved_payload: object
def _format_error(exc: Exception) -> str:
return f"{type(exc).__name__}: {exc}"
class _TransitionUpdateError(RuntimeError):
    """Raised when a repository status-transition call reports that it was not applied."""

    pass
def _ensure_transition_applied(applied: bool, *, item_id: int, action: str) -> None:
if applied:
return
raise _TransitionUpdateError(
f"CAS transition failed for item {item_id}: {action} returned False"
)
def _mark_failed_or_raise(ops_repo: OpsRepository, *, item_id: int, error_message: str, cause: Exception) -> None:
if ops_repo.mark_item_failed(item_id=item_id, error_message=error_message):
return
raise RuntimeError(
f"CAS transition failed for item {item_id}: mark_item_failed returned False while handling error: {error_message}"
) from cause
def _mark_non_music_resource_skipped_or_raise(ops_repo: OpsRepository, *, item_id: int) -> None:
    """Mark an item as skipped with the non-music reason and code.

    Raises:
        _TransitionUpdateError: If ``mark_item_skipped`` reports the
            transition was not applied.
    """
    skipped = ops_repo.mark_item_skipped(
        item_id=item_id,
        reason_message=NON_MUSIC_RESOURCE_REASON,
        reason_code=NON_MUSIC_RESOURCE_CODE,
    )
    _ensure_transition_applied(skipped, item_id=item_id, action="mark_item_skipped")
def _is_non_music_resource_download_row(row: dict[str, object] | None) -> bool:
row = row or {}
remote_song_id = str(row.get("remote_song_id") or "").strip().lower()
if remote_song_id.startswith("qqtop_"):
return True
metadata_json = row.get("metadata_json")
if not metadata_json:
return False
try:
metadata = json.loads(str(metadata_json))
except Exception:
return False
if not isinstance(metadata, dict):
return False
snapshot = metadata.get("snapshot")
if not isinstance(snapshot, dict):
return False
raw_data = snapshot.get("raw_data")
if not isinstance(raw_data, dict):
return False
search = raw_data.get("search")
if not isinstance(search, dict):
return False
return bool(search.get("qq_toplist_fallback"))
class CollectStageExecutor:
    """Processes "collect" job items by collecting playlists for one source."""

    def __init__(
        self,
        db_path: str | Path,
        service: CatalogSyncService | None = None,
        ops_repo: OpsRepository | None = None,
    ):
        """Wire up repositories and the sync service, creating defaults where omitted."""
        self.db_path = Path(db_path)
        self.ops_repo = ops_repo or OpsRepository(self.db_path)
        self.catalog_repo = CatalogRepository(self.db_path)
        self.service = service or CatalogSyncService(repository=self.catalog_repo)

    def process_item(self, item_id: int, worker_name: str, *, already_claimed: bool = False) -> None:
        """Run playlist collection for one item and record the outcome.

        Args:
            item_id: Job item to process; its payload must carry ``source``.
            worker_name: Worker whose state row receives progress updates.
            already_claimed: Skip the ``claim_item`` call when ``True``.

        Raises:
            _TransitionUpdateError: Re-raised after the failure has been
                recorded when the success transition was not applied.
            RuntimeError: If recording the failure itself is rejected.
        """
        if not already_claimed:
            self.ops_repo.claim_item(item_id=item_id, worker_name=worker_name)
        try:
            item = self.ops_repo.get_item(item_id)
            if item is None:
                raise RuntimeError(f"Unknown item: {item_id}")
            source = str(item.payload.get("source") or "").strip()
            if not source:
                raise RuntimeError(f"Collect item {item_id} is missing source")
            display_text = f"collect:{source}"

            def publish(progress_text: str):
                # Single place that pushes progress for this item to the worker row.
                return self.ops_repo.update_worker_state(
                    worker_name=worker_name,
                    current_job_item_id=item_id,
                    status="running",
                    current_display_text=display_text,
                    last_progress_text=progress_text,
                )

            publish("starting playlist collection")
            counts = self.service.collect_playlists(
                sources=[source],
                include_playlist_square=bool(item.payload.get("include_playlist_square", True)),
                include_toplist=bool(item.payload.get("include_toplist", True)),
                progress_callback=lambda event_type, payload: publish(
                    self._format_progress_text(event_type, payload)
                ),
            )
            succeeded = self.ops_repo.mark_item_succeeded(
                item_id=item_id,
                result_payload={"counts": counts},
            )
            _ensure_transition_applied(succeeded, item_id=item_id, action="mark_item_succeeded")
        except Exception as exc:
            _mark_failed_or_raise(
                self.ops_repo,
                item_id=item_id,
                error_message=_format_error(exc),
                cause=exc,
            )
            # A rejected transition is surfaced to the caller once the failure is stored.
            if isinstance(exc, _TransitionUpdateError):
                raise

    @staticmethod
    def _format_progress_text(event_type: str, payload: dict[str, object]) -> str:
        """Turn a collection progress event into a short status line.

        Unknown event types fall back to the event name with underscores
        replaced by spaces.
        """

        def count_of(mapping: dict, key: str) -> int:
            # Missing or falsy entries count as zero.
            return int(mapping.get(key) or 0)

        if event_type == "playlist_square_page":
            page = count_of(payload, "page")
            total = count_of(payload, "total")
            new_count = count_of(payload, "new_count")
            if payload.get("duplicate_page"):
                return f"page {page}: duplicate page detected, stopping at {total}"
            return f"page {page}: +{new_count}, total {total}"
        if event_type == "toplist_collected":
            return f"toplist: {int(payload.get('count') or 0)}"
        if event_type == "source_finished":
            counts = payload.get("counts")
            if not isinstance(counts, dict):
                counts = {}
            playlist_square = count_of(counts, "playlist_square")
            toplist = count_of(counts, "toplist")
            return f"done: square {playlist_square}, toplist {toplist}"
        return str(event_type).replace("_", " ")
class DownloadStageExecutor:
    """Processes "download" job items, in one pass or as separate resolve/download steps."""

    def __init__(
        self,
        db_path: str | Path,
        library_root: str | Path,
        download_sources: list[str] | None = None,
        downloader: CatalogDownloader | None = None,
        ops_repo: OpsRepository | None = None,
    ):
        """Wire up repositories and the downloader, creating defaults where omitted."""
        self.db_path = Path(db_path)
        self.library_root = Path(library_root)
        self.download_sources = list(download_sources or [])
        self.ops_repo = ops_repo or OpsRepository(self.db_path)
        self.catalog_repo = CatalogRepository(self.db_path)
        self.downloader = downloader or CatalogDownloader(repository=self.catalog_repo)

    # ------------------------------------------------------------------
    # Shared building blocks
    # ------------------------------------------------------------------
    def _worker_callback(self, worker_name: str, item_id: int):
        """Build the progress callback handed to the downloader for one item."""

        def report(**state):
            return self.ops_repo.update_worker_state(
                worker_name=worker_name,
                current_job_item_id=item_id,
                status="running",
                **state,
            )

        return report

    def _finish_if_already_downloaded(
        self,
        row: dict[str, object],
        *,
        item_id: int,
        worker_name: str,
    ) -> bool:
        """Mark the item succeeded when its song already has an active local file.

        Returns:
            ``True`` when the item was completed here, ``False`` when the
            caller still has to download it.
        """
        song_id = int(row.get("id") or row.get("song_id") or 0)
        if song_id <= 0 or not self.catalog_repo.song_has_active_local_file(song_id):
            return False
        self.ops_repo.update_worker_state(
            worker_name=worker_name,
            current_job_item_id=item_id,
            status="running",
            current_song_id=song_id,
            current_playlist_id=row.get("playlist_id"),
            current_display_text=str(row.get("name") or row.get("id") or song_id),
            last_progress_text="already downloaded",
        )
        succeeded = self.ops_repo.mark_item_succeeded(
            item_id=item_id,
            result_payload={"already_downloaded": True},
        )
        _ensure_transition_applied(succeeded, item_id=item_id, action="mark_item_succeeded")
        return True

    def _settle_without_result(
        self,
        row: dict[str, object] | None,
        *,
        item_id: int,
        error_message: str,
    ) -> None:
        """Close an item that produced nothing: skipped for non-music rows, failed otherwise."""
        if _is_non_music_resource_download_row(row):
            _mark_non_music_resource_skipped_or_raise(self.ops_repo, item_id=item_id)
            return
        failed = self.ops_repo.mark_item_failed(item_id=item_id, error_message=error_message)
        _ensure_transition_applied(failed, item_id=item_id, action="mark_item_failed")

    def _settle_after_exception(
        self,
        row: dict[str, object] | None,
        *,
        item_id: int,
        exc: Exception,
    ) -> None:
        """Close an item whose processing raised: skipped for non-music rows, failed otherwise."""
        if _is_non_music_resource_download_row(row):
            _mark_non_music_resource_skipped_or_raise(self.ops_repo, item_id=item_id)
            return
        _mark_failed_or_raise(
            self.ops_repo,
            item_id=item_id,
            error_message=_format_error(exc),
            cause=exc,
        )

    # ------------------------------------------------------------------
    # Two-step flow: resolve, then download
    # ------------------------------------------------------------------
    def process_resolve_item(
        self,
        item_id: int,
        worker_name: str,
        *,
        ready_queue,
        already_claimed: bool = False,
    ) -> None:
        """Resolve one item and queue it for download.

        Items whose song is already on disk are completed immediately; items
        that resolve to nothing are marked skipped or failed. Everything else
        is put on *ready_queue* as a ``ResolvedStageDownloadTask``.

        NOTE(review): unlike ``process_item``, exceptions from building the
        row or resolving it propagate to the caller without the item being
        marked failed here — confirm the caller handles that.
        """
        if not already_claimed:
            self.ops_repo.claim_item(item_id=item_id, worker_name=worker_name)
        row = self.ops_repo.build_download_row(item_id=item_id)
        if self._finish_if_already_downloaded(row, item_id=item_id, worker_name=worker_name):
            return
        resolved_payload = self.downloader.resolve_song_row(
            row=row,
            library_root=self.library_root,
            download_sources=self.download_sources,
            worker_callback=self._worker_callback(worker_name, item_id),
        )
        if resolved_payload is None:
            self._settle_without_result(
                row,
                item_id=item_id,
                error_message="resolve returned no downloadable song",
            )
            return
        ready_queue.put(
            ResolvedStageDownloadTask(
                item_id=item_id,
                playlist_id=row.get("playlist_id"),
                row=row,
                resolved_payload=resolved_payload,
            )
        )

    def process_download_task(self, task: ResolvedStageDownloadTask, worker_name: str) -> None:
        """Download a previously resolved task and record the outcome.

        Raises:
            _TransitionUpdateError: Re-raised after the outcome has been
                recorded when a status transition was not applied.
        """
        try:
            downloaded = self.downloader.download_resolved_song(
                resolved_payload=task.resolved_payload,
                worker_callback=self._worker_callback(worker_name, task.item_id),
            )
            if downloaded:
                succeeded = self.ops_repo.mark_item_succeeded(item_id=task.item_id)
                _ensure_transition_applied(
                    succeeded,
                    item_id=task.item_id,
                    action="mark_item_succeeded",
                )
            else:
                self._settle_without_result(
                    task.row,
                    item_id=task.item_id,
                    error_message="download returned no file",
                )
        except Exception as exc:
            self._settle_after_exception(task.row, item_id=task.item_id, exc=exc)
            if isinstance(exc, _TransitionUpdateError):
                raise

    # ------------------------------------------------------------------
    # Single-step flow
    # ------------------------------------------------------------------
    def process_item(self, item_id: int, worker_name: str, *, already_claimed: bool = False) -> None:
        """Resolve and download one item in a single call, recording the outcome.

        Raises:
            _TransitionUpdateError: Re-raised after the outcome has been
                recorded when a status transition was not applied.
        """
        if not already_claimed:
            self.ops_repo.claim_item(item_id=item_id, worker_name=worker_name)
        # Stays None when building the row itself fails; the handlers accept that.
        row: dict[str, object] | None = None
        try:
            row = self.ops_repo.build_download_row(item_id=item_id)
            if self._finish_if_already_downloaded(row, item_id=item_id, worker_name=worker_name):
                return
            downloaded = self.downloader.download_song_row(
                row=row,
                library_root=self.library_root,
                download_sources=self.download_sources,
                worker_callback=self._worker_callback(worker_name, item_id),
            )
            if downloaded:
                succeeded = self.ops_repo.mark_item_succeeded(item_id=item_id)
                _ensure_transition_applied(succeeded, item_id=item_id, action="mark_item_succeeded")
            else:
                self._settle_without_result(
                    row,
                    item_id=item_id,
                    error_message="download returned no file",
                )
        except Exception as exc:
            self._settle_after_exception(row, item_id=item_id, exc=exc)
            if isinstance(exc, _TransitionUpdateError):
                raise
class SyncStageExecutor:
    """Processes "sync" job items by syncing one playlist row per item."""

    def __init__(
        self,
        db_path: str | Path,
        service: CatalogSyncService | None = None,
        service_factory: Callable[[], CatalogSyncService] | None = None,
        ops_repo: OpsRepository | None = None,
    ):
        """Wire up repositories and decide how sync services are obtained.

        Precedence: an explicit *service_factory*, then a factory that always
        hands back *service*, then a factory building a new
        ``CatalogSyncService`` on the shared catalog repository.
        """
        self.db_path = Path(db_path)
        self.ops_repo = ops_repo or OpsRepository(self.db_path)
        self.catalog_repo = CatalogRepository(self.db_path)

        factory = service_factory
        if factory is None:
            if service is None:
                def factory():
                    return CatalogSyncService(repository=self.catalog_repo)
            else:
                def factory():
                    return service
        self._service_factory = factory
        # Holds the service obtained by each thread on its first use.
        self._service_local = threading.local()

    def _get_service(self) -> CatalogSyncService:
        """Return the calling thread's service, creating it through the factory on first use."""
        try:
            cached = self._service_local.service
        except AttributeError:
            cached = None
        if cached is None:
            cached = self._service_factory()
            self._service_local.service = cached
        return cached

    def process_item(self, item_id: int, worker_name: str, *, already_claimed: bool = False) -> None:
        """Sync the playlist behind one item and record the linked count.

        Raises:
            _TransitionUpdateError: Re-raised after the failure has been
                recorded when the success transition was not applied.
        """
        if not already_claimed:
            self.ops_repo.claim_item(item_id=item_id, worker_name=worker_name)
        try:
            playlist_row = self.ops_repo.get_playlist_row_for_item(item_id=item_id)
            sync_service = self._get_service()
            linked_count = int(sync_service.sync_playlist_row(playlist_row))
            succeeded = self.ops_repo.mark_item_succeeded(
                item_id=item_id,
                result_payload={"linked_count": linked_count},
            )
            _ensure_transition_applied(succeeded, item_id=item_id, action="mark_item_succeeded")
        except Exception as exc:
            _mark_failed_or_raise(
                self.ops_repo,
                item_id=item_id,
                error_message=_format_error(exc),
                cause=exc,
            )
            if isinstance(exc, _TransitionUpdateError):
                raise
class UploadStageExecutor:
    """Processes "upload" job items through the catalog uploader for one backend."""

    def __init__(
        self,
        db_path: str | Path,
        backend_name: str,
        uploader: CatalogUploader | None = None,
        ops_repo: OpsRepository | None = None,
    ):
        """Wire up repositories and the uploader, creating defaults where omitted."""
        self.db_path = Path(db_path)
        self.backend_name = str(backend_name)
        self.ops_repo = ops_repo or OpsRepository(self.db_path)
        self.catalog_repo = CatalogRepository(self.db_path)
        self.uploader = uploader or CatalogUploader(repository=self.catalog_repo)

    def process_item(self, item_id: int, worker_name: str, *, already_claimed: bool = False) -> None:
        """Upload the row behind one item and record the outcome.

        The item succeeds only when the uploader's result, as a string, is
        exactly ``"succeeded"``; any other result is stored as a failure.

        Raises:
            _TransitionUpdateError: Re-raised after the failure has been
                recorded when a status transition was not applied.
        """
        if not already_claimed:
            self.ops_repo.claim_item(item_id=item_id, worker_name=worker_name)
        try:
            upload_row = self.ops_repo.get_upload_row_for_item(item_id=item_id)
            outcome = str(
                self.uploader.process_upload_task_row(
                    task_row=upload_row,
                    backend_name=self.backend_name,
                )
            )
            if outcome == "succeeded":
                action = "mark_item_succeeded"
                applied = self.ops_repo.mark_item_succeeded(item_id=item_id)
            else:
                action = "mark_item_failed"
                applied = self.ops_repo.mark_item_failed(
                    item_id=item_id,
                    error_message=f"upload result: {outcome}",
                )
            _ensure_transition_applied(applied, item_id=item_id, action=action)
        except Exception as exc:
            _mark_failed_or_raise(
                self.ops_repo,
                item_id=item_id,
                error_message=_format_error(exc),
                cause=exc,
            )
            if isinstance(exc, _TransitionUpdateError):
                raise
@@ -0,0 +1,48 @@
from __future__ import annotations
from typing import Any
# Lane name returned by job_lane_type() for job types that contain a "download" stage.
DOWNLOAD_LANE = "download"
# Lane name returned by job_lane_type() for every other job type.
GENERAL_LANE = "general"
# Maps each known job type to the tuple of stage types it consists of.
# NOTE(review): tuple order is presumably the execution order — confirm against the runner.
JOB_STAGE_SEQUENCES: dict[str, tuple[str, ...]] = {
    "catalog_sync": ("collect", "sync", "download"),
    "collect_only": ("collect",),
    "sync_only": ("sync",),
    "sync_download": ("sync", "download"),
    "download_only": ("download",),
    "upload_only": ("upload",),
    "download_upload": ("download", "upload"),
}
def job_has_stage(job_type: str, stage_type: str) -> bool:
    """Return whether *stage_type* is one of the stages of *job_type*.

    Unknown job types have no stages, so the answer is ``False``.
    """
    stages = JOB_STAGE_SEQUENCES.get(str(job_type))
    if stages is None:
        return False
    return str(stage_type) in stages
def job_lane_type(job_type: str) -> str:
    """Return the lane for *job_type*: the download lane if it has a download stage, else general."""
    needs_download = job_has_stage(job_type, "download")
    return DOWNLOAD_LANE if needs_download else GENERAL_LANE
def primary_stage_type(job_type: str) -> str | None:
    """Return the highest-priority stage present in *job_type*.

    Priority is download, upload, sync, collect. ``None`` is returned when
    the job type has none of them (e.g. an unknown job type).
    """
    priority = ("download", "upload", "sync", "collect")
    return next(
        (candidate for candidate in priority if job_has_stage(job_type, candidate)),
        None,
    )
def display_name(job_type: str, playlist_scope: dict[str, Any] | None = None) -> str:
    """Return the label shown for a job type.

    Args:
        job_type: Job type key; unknown keys are returned unchanged (as str).
        playlist_scope: Optional scope; when its ``playlist_ids`` entry is a
            non-empty list, the sync/download labels use their "selected
            playlists" wording.
    """
    scope = playlist_scope or {}
    playlist_ids = scope.get("playlist_ids")
    scoped = isinstance(playlist_ids, list) and bool(playlist_ids)

    labels = {
        "catalog_sync": "Full Pipeline",
        "collect_only": "Collect",
        "sync_only": "Sync",
        "sync_download": "Sync Then Download All",
        "download_only": "Download",
        "upload_only": "Upload",
        "download_upload": "Download Then Upload",
    }
    if scoped:
        # Only these three job types have a scoped wording.
        labels.update(
            {
                "sync_only": "Sync Selected Playlists",
                "sync_download": "Sync Then Download",
                "download_only": "Download Selected Playlists",
            }
        )
    key = str(job_type)
    return labels.get(key, key)
@@ -0,0 +1,402 @@
from __future__ import annotations
import re
import sqlite3
from contextlib import contextmanager, suppress
from pathlib import Path, PurePath
from typing import Any
from musicdl.catalogsync.db import connect_database
# Matches a " (<digits>)" copy marker located at the very end of a name or
# directly before its final extension, e.g. "song (1).mp3" or "song (2)".
_COPY_SUFFIX_RE = re.compile(r" \(\d+\)(?=(\.[^.]+)?$)")
class LocalDedupeBlockedError(RuntimeError):
    """Raised when dedupe is refused because job runs or job items are still running."""

    pass
def _coerce_int(value: Any) -> int | None:
try:
return int(value)
except (TypeError, ValueError):
return None
def _row_value(row: sqlite3.Row | dict[str, Any], key: str) -> Any:
if isinstance(row, sqlite3.Row):
try:
return row[key]
except IndexError:
return None
return row.get(key)
def _path_for_location(row: sqlite3.Row | dict[str, Any]) -> Path | None:
    """Work out the filesystem path of a location row.

    Returns:
        The row's non-blank ``absolute_path`` when it has one; otherwise
        ``base_path / locator`` when both are non-blank; otherwise ``None``.
    """

    def text_of(column: str) -> str:
        # Missing, NULL and whitespace-only values all collapse to "".
        return str(_row_value(row, column) or "").strip()

    absolute = text_of("absolute_path")
    if absolute:
        return Path(absolute)
    base_path = text_of("base_path")
    locator = text_of("locator")
    if base_path and locator:
        return Path(base_path) / locator
    return None
def _resolved_path(path: Path | None) -> Path | None:
if path is None:
return None
with suppress(OSError, RuntimeError):
return path.resolve(strict=False)
return path
def _paths_match(left: Path | None, right: Path | None) -> bool:
if left is None or right is None:
return False
return _resolved_path(left) == _resolved_path(right)
def _has_copy_suffix(locator: str | None) -> bool:
    """Return whether the locator's final path component carries a " (N)" copy marker."""
    file_name = PurePath(str(locator or "")).name
    return _COPY_SUFFIX_RE.search(file_name) is not None
def _location_payload(row: sqlite3.Row | dict[str, Any]) -> dict[str, Any]:
    """Convert a duplicate-location query row into a plain dict.

    Besides the typed copies of the row's columns, the result records whether
    the file exists on disk, its size on disk when it can be read, and the
    computed path under the private ``_path`` key.
    """
    path = _path_for_location(row)
    file_exists = path is not None and path.exists()

    size_on_disk = None
    if file_exists:
        try:
            size_on_disk = int(path.stat().st_size)
        except OSError:
            # The file may vanish or be unreadable between exists() and stat().
            size_on_disk = None

    payload: dict[str, Any] = {}
    payload["id"] = int(row["location_id"])
    payload["file_asset_id"] = int(row["file_asset_id"])
    payload["song_id"] = int(row["song_id"])
    payload["backend_id"] = int(row["backend_id"])
    payload["backend_name"] = str(row["backend_name"] or "")
    payload["locator"] = str(row["locator"] or "")
    payload["absolute_path"] = str(row["absolute_path"] or "")
    payload["file_exists"] = file_exists
    payload["file_size_bytes"] = _coerce_int(row["file_size_bytes"])
    payload["actual_file_size_bytes"] = size_on_disk
    payload["song_name"] = str(row["song_name"] or "")
    payload["singers"] = str(row["singers"] or "")
    payload["_path"] = path
    return payload
def _location_sort_key(location: dict[str, Any]) -> tuple[int, int, int, int]:
    """Sort key that ranks the best location to keep first.

    Preference order: file exists on disk, locator has no copy marker,
    shorter locator, lower location id.
    """
    locator = location["locator"]
    missing_rank = 0 if location["file_exists"] else 1
    copy_rank = 1 if _has_copy_suffix(locator) else 0
    return (missing_rank, copy_rank, len(locator), int(location["id"]))
def _duplicate_size_bytes(location: dict[str, Any]) -> int:
size_value = location.get("actual_file_size_bytes")
if size_value is None:
size_value = location.get("file_size_bytes")
return max(int(size_value or 0), 0)
class LocalMaintenanceService:
    """Finds and removes duplicate active file locations on local-filesystem backends.

    A duplicate group is a set of more than one ``active`` row in
    ``file_locations`` sharing the same ``file_asset_id`` and ``backend_id``
    on a backend whose ``backend_type`` is ``local_fs``.
    """

    def __init__(self, db_path: str | Path):
        """Remember the database path; no connection is opened here."""
        self.db_path = Path(db_path)

    def _connect(self) -> sqlite3.Connection:
        """Open a new connection through the project's ``connect_database`` helper."""
        return connect_database(self.db_path)

    @contextmanager
    def _connection(self):
        """Yield a connection that is committed on success and always closed.

        When the body raises, the connection is closed without a commit.
        """
        conn = self._connect()
        try:
            yield conn
            conn.commit()
        finally:
            conn.close()

    def scan_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Report duplicate groups without changing anything.

        Args:
            sample_limit: Maximum number of groups included in ``groups``
                (the summary always covers every group).

        Returns:
            A dict with a ``summary`` of counts and a ``groups`` sample.
        """
        with self._connection() as conn:
            groups = self._load_duplicate_groups(conn)
            scanned_row = conn.execute(
                """
                SELECT COUNT(*) AS count_value
                FROM file_locations AS fl
                JOIN storage_backends AS sb ON sb.id = fl.backend_id
                WHERE fl.status = 'active'
                AND sb.backend_type = 'local_fs'
                """
            ).fetchone()
            scanned_count = int(scanned_row["count_value"]) if scanned_row else 0
        return self._build_scan_payload(
            groups,
            scanned_active_local_location_count=scanned_count,
            sample_limit=sample_limit,
        )

    def dedupe_local_duplicates(self, *, sample_limit: int = 20) -> dict[str, Any]:
        """Collapse every duplicate group onto one kept location.

        For each group the kept location becomes primary, upload tasks and
        job items that pointed at a duplicate are re-pointed to it, and the
        duplicates are set inactive. Duplicate files are deleted from disk
        only after those database changes have been committed, so a failed
        transaction can never leave active rows pointing at deleted files.

        Args:
            sample_limit: Forwarded to the closing scan.

        Returns:
            The result of a fresh scan taken after the commit, with an extra
            ``execution`` dict of counters for the work done.

        Raises:
            LocalDedupeBlockedError: If any job run or job item is running.
        """
        execution = {
            "deduped_group_count": 0,
            "inactive_location_count": 0,
            "deleted_file_count": 0,
            "released_bytes": 0,
            "repointed_upload_task_count": 0,
            "repointed_job_item_count": 0,
        }
        # (path, size) pairs whose removal is postponed until after the commit.
        pending_deletions: list[tuple[Path, int]] = []

        with self._connection() as conn:
            self._raise_if_running_work(conn)
            groups = self._load_duplicate_groups(conn)
            affected_pairs: set[tuple[int, int]] = set()
            for group in groups:
                duplicates = list(group["duplicates"])
                if not duplicates:
                    continue
                keep = group["keep"]
                keep_id = int(keep["id"])
                execution["deduped_group_count"] += 1
                # Exactly one primary per (file asset, backend): the kept row.
                conn.execute(
                    """
                    UPDATE file_locations
                    SET
                    is_primary = CASE WHEN id = ? THEN 1 ELSE 0 END,
                    updated_at = CURRENT_TIMESTAMP
                    WHERE file_asset_id = ? AND backend_id = ?
                    """,
                    (
                        keep_id,
                        int(group["file_asset_id"]),
                        int(group["backend_id"]),
                    ),
                )
                for duplicate in duplicates:
                    duplicate_id = int(duplicate["id"])
                    upload_cursor = conn.execute(
                        """
                        UPDATE upload_tasks
                        SET
                        source_location_id = ?,
                        updated_at = CURRENT_TIMESTAMP
                        WHERE source_location_id = ?
                        """,
                        (keep_id, duplicate_id),
                    )
                    execution["repointed_upload_task_count"] += max(upload_cursor.rowcount, 0)
                    item_cursor = conn.execute(
                        """
                        UPDATE job_items
                        SET file_location_id = ?
                        WHERE file_location_id = ?
                        """,
                        (keep_id, duplicate_id),
                    )
                    execution["repointed_job_item_count"] += max(item_cursor.rowcount, 0)
                    inactive_cursor = conn.execute(
                        """
                        UPDATE file_locations
                        SET
                        status = 'inactive',
                        is_primary = 0,
                        updated_at = CURRENT_TIMESTAMP
                        WHERE id = ? AND status = 'active'
                        """,
                        (duplicate_id,),
                    )
                    execution["inactive_location_count"] += max(inactive_cursor.rowcount, 0)
                    duplicate_path = duplicate["_path"]
                    # Never queue the kept file itself, even if a duplicate row points at it.
                    if (
                        duplicate_path is not None
                        and duplicate_path.exists()
                        and not _paths_match(duplicate_path, keep["_path"])
                    ):
                        pending_deletions.append(
                            (duplicate_path, _duplicate_size_bytes(duplicate))
                        )
                affected_pairs.add((int(group["song_id"]), int(group["backend_id"])))
            for song_id, backend_id in affected_pairs:
                self._refresh_song_backend_presence_with_connection(
                    conn,
                    song_id=song_id,
                    backend_id=backend_id,
                )

        # The transaction is committed; removing files is now safe. Deletion
        # stays best-effort: a file that cannot be removed is simply not counted.
        for doomed_path, size_bytes in pending_deletions:
            try:
                doomed_path.unlink()
            except OSError:
                continue
            execution["deleted_file_count"] += 1
            execution["released_bytes"] += size_bytes

        # Scan on a fresh connection after the commit so the report reflects the new state.
        payload = self.scan_local_duplicates(sample_limit=sample_limit)
        payload["execution"] = execution
        return payload

    def _raise_if_running_work(self, conn: sqlite3.Connection) -> None:
        """Refuse to continue while any job run or job item has status ``running``.

        Raises:
            LocalDedupeBlockedError: If at least one running row exists.
        """
        running_jobs_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_runs WHERE status = 'running'"
        ).fetchone()
        running_items_row = conn.execute(
            "SELECT COUNT(*) AS count_value FROM job_items WHERE status = 'running'"
        ).fetchone()
        running_jobs = int(running_jobs_row["count_value"]) if running_jobs_row else 0
        running_items = int(running_items_row["count_value"]) if running_items_row else 0
        if running_jobs > 0 or running_items > 0:
            raise LocalDedupeBlockedError(
                f"cannot dedupe while jobs or items are running (jobs={running_jobs}, items={running_items})"
            )

    def _load_duplicate_groups(self, conn: sqlite3.Connection) -> list[dict[str, Any]]:
        """Load every duplicate group and choose the location to keep in each.

        Returns:
            One dict per group, sorted by song, file asset and backend id,
            holding the ``keep`` location and the remaining ``duplicates``
            (ordered by ``_location_sort_key``).
        """
        rows = conn.execute(
            """
            WITH duplicate_pairs AS (
            SELECT fl.file_asset_id, fl.backend_id
            FROM file_locations AS fl
            JOIN storage_backends AS sb ON sb.id = fl.backend_id
            WHERE fl.status = 'active'
            AND sb.backend_type = 'local_fs'
            GROUP BY fl.file_asset_id, fl.backend_id
            HAVING COUNT(*) > 1
            )
            SELECT
            fl.id AS location_id,
            fl.file_asset_id,
            fa.song_id,
            fl.backend_id,
            sb.name AS backend_name,
            sb.base_path,
            fl.locator,
            fl.absolute_path,
            COALESCE(fa.file_size_bytes, s.file_size_bytes) AS file_size_bytes,
            s.name AS song_name,
            s.singers
            FROM file_locations AS fl
            JOIN duplicate_pairs AS dp
            ON dp.file_asset_id = fl.file_asset_id
            AND dp.backend_id = fl.backend_id
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            JOIN songs AS s ON s.id = fa.song_id
            JOIN storage_backends AS sb ON sb.id = fl.backend_id
            WHERE fl.status = 'active'
            ORDER BY fl.file_asset_id ASC, fl.backend_id ASC, fl.id ASC
            """
        ).fetchall()
        grouped: dict[tuple[int, int], list[dict[str, Any]]] = {}
        for row in rows:
            location = _location_payload(row)
            key = (int(location["file_asset_id"]), int(location["backend_id"]))
            grouped.setdefault(key, []).append(location)
        groups: list[dict[str, Any]] = []
        for (file_asset_id, backend_id), locations in grouped.items():
            # The best-ranked location is kept; the rest are the duplicates.
            ordered_locations = sorted(locations, key=_location_sort_key)
            keep = ordered_locations[0]
            groups.append(
                {
                    "file_asset_id": int(file_asset_id),
                    "backend_id": int(backend_id),
                    "backend_name": keep["backend_name"],
                    "song_id": int(keep["song_id"]),
                    "song_name": keep["song_name"],
                    "singers": keep["singers"],
                    "keep": keep,
                    "duplicates": ordered_locations[1:],
                }
            )
        groups.sort(
            key=lambda group: (
                int(group["song_id"]),
                int(group["file_asset_id"]),
                int(group["backend_id"]),
            )
        )
        return groups

    def _build_scan_payload(
        self,
        groups: list[dict[str, Any]],
        *,
        scanned_active_local_location_count: int,
        sample_limit: int,
    ) -> dict[str, Any]:
        """Assemble the scan report from the loaded groups.

        A falsy *sample_limit* falls back to 20 and the limit is never below
        1. The summary covers all groups; ``groups`` holds only the sample.
        """
        normalized_sample_limit = max(int(sample_limit or 20), 1)
        return {
            "summary": {
                "duplicate_group_count": len(groups),
                "duplicate_location_count": sum(len(group["duplicates"]) for group in groups),
                "duplicate_file_size_bytes": sum(
                    _duplicate_size_bytes(location)
                    for group in groups
                    for location in group["duplicates"]
                ),
                "scanned_active_local_location_count": int(scanned_active_local_location_count),
            },
            "groups": [self._serialize_group(group) for group in groups[:normalized_sample_limit]],
        }

    @staticmethod
    def _serialize_group(group: dict[str, Any]) -> dict[str, Any]:
        """Return the report form of a group with its locations serialized."""
        return {
            "file_asset_id": int(group["file_asset_id"]),
            "backend_id": int(group["backend_id"]),
            "backend_name": str(group["backend_name"]),
            "song_id": int(group["song_id"]),
            "song_name": str(group["song_name"]),
            "singers": str(group["singers"]),
            "keep": LocalMaintenanceService._serialize_location(group["keep"]),
            "duplicates": [
                LocalMaintenanceService._serialize_location(location)
                for location in group["duplicates"]
            ],
        }

    @staticmethod
    def _serialize_location(location: dict[str, Any]) -> dict[str, Any]:
        """Return the report form of a location (the private ``_path`` is left out)."""
        return {
            "id": int(location["id"]),
            "locator": str(location["locator"]),
            "absolute_path": str(location["absolute_path"]),
            "file_exists": bool(location["file_exists"]),
            "file_size_bytes": _coerce_int(location["file_size_bytes"]),
            "actual_file_size_bytes": _coerce_int(location["actual_file_size_bytes"]),
        }

    @staticmethod
    def _refresh_song_backend_presence_with_connection(
        conn: sqlite3.Connection,
        *,
        song_id: int,
        backend_id: int,
    ) -> None:
        """Recompute the ``song_backend_presence`` row for one song/backend pair.

        The row is inserted or updated with the number of active locations
        and the lowest active location id as ``primary_file_location_id``.
        NOTE(review): the lowest id is not necessarily the location flagged
        ``is_primary`` — confirm that this is intended.
        """
        summary = conn.execute(
            """
            SELECT
            COUNT(*) AS active_file_count,
            MIN(fl.id) AS primary_file_location_id
            FROM file_locations AS fl
            JOIN file_assets AS fa ON fa.id = fl.file_asset_id
            WHERE fa.song_id = ?
            AND fl.backend_id = ?
            AND fl.status = 'active'
            """,
            (int(song_id), int(backend_id)),
        ).fetchone()
        active_file_count = int(summary["active_file_count"]) if summary else 0
        has_active_file = 1 if active_file_count > 0 else 0
        primary_file_location_id = summary["primary_file_location_id"] if summary else None
        conn.execute(
            """
            INSERT INTO song_backend_presence (
            song_id,
            backend_id,
            has_active_file,
            active_file_count,
            primary_file_location_id
            )
            VALUES (?, ?, ?, ?, ?)
            ON CONFLICT(song_id, backend_id) DO UPDATE SET
            has_active_file = excluded.has_active_file,
            active_file_count = excluded.active_file_count,
            primary_file_location_id = excluded.primary_file_location_id,
            updated_at = CURRENT_TIMESTAMP
            """,
            (
                int(song_id),
                int(backend_id),
                has_active_file,
                active_file_count,
                primary_file_location_id,
            ),
        )
@@ -0,0 +1,93 @@
from __future__ import annotations
from dataclasses import dataclass
from enum import Enum
from typing import Any
class JobStatus(str, Enum):
    """Status values of a job run; each member is also a plain ``str``."""

    QUEUED = "queued"
    RUNNING = "running"
    PAUSE_REQUESTED = "pause_requested"
    PAUSED = "paused"
    COMPLETED = "completed"
    COMPLETED_WITH_ERRORS = "completed_with_errors"
    FAILED = "failed"
    CANCELED = "canceled"
class StageStatus(str, Enum):
    """Status values of a job stage; each member is also a plain ``str``."""

    PENDING = "pending"
    RUNNING = "running"
    PAUSE_REQUESTED = "pause_requested"
    PAUSED = "paused"
    COMPLETED = "completed"
    FAILED = "failed"
    SKIPPED = "skipped"
class ItemStatus(str, Enum):
    """Status values of a job item; each member is also a plain ``str``."""

    PENDING = "pending"
    RUNNING = "running"
    SUCCEEDED = "succeeded"
    FAILED = "failed"
    INTERRUPTED = "interrupted"
    SKIPPED = "skipped"
    CANCELED = "canceled"
@dataclass(frozen=True)
class JobRun:
    """Immutable record describing one job run."""

    id: int
    # Job type key, e.g. one of the keys used in the job definitions module.
    job_type: str
    status: JobStatus
    priority: int
    requested_by: str | None
    # NOTE(review): presumably the configuration captured when the job was created — confirm.
    config_snapshot: dict[str, Any]
    sources: list[str]
    download_sources: list[str]
    # Scope dict; other code in this package reads its "playlist_ids" entry.
    playlist_scope: dict[str, Any]
    # Timestamps are carried as strings (or None when unset).
    created_at: str | None
    started_at: str | None
    ended_at: str | None
    last_error: str | None
    resume_token: str | None
@dataclass(frozen=True)
class JobStage:
    """Immutable record describing one stage of a job run, including item counters."""

    id: int
    # Id of the JobRun this stage belongs to.
    job_run_id: int
    stage_type: str
    # NOTE(review): presumably the stage's position within its job — confirm.
    seq_no: int
    status: StageStatus
    # Per-status item counters for the stage.
    total_items: int
    pending_items: int
    running_items: int
    success_items: int
    failed_items: int
    skipped_items: int
    # Timestamps are carried as strings (or None when unset).
    started_at: str | None
    ended_at: str | None
    last_error: str | None
@dataclass(frozen=True)
class JobItem:
    """Immutable record describing one unit of work inside a job stage."""

    id: int
    # Id of the JobStage this item belongs to.
    job_stage_id: int
    item_type: str
    item_key: str
    # Optional references to the catalog rows the item concerns.
    playlist_pool_id: int | None
    playlist_id: int | None
    song_id: int | None
    file_location_id: int | None
    status: ItemStatus
    attempt_count: int
    max_attempts: int
    worker_id: int | None
    # Timestamps are carried as strings (or None when unset).
    started_at: str | None
    ended_at: str | None
    last_error: str | None
    last_error_code: str | None
    # Free-form item parameters; the stage executors read keys such as "source" from it.
    payload: dict[str, Any]
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,896 @@
from __future__ import annotations
import logging
import threading
import time
from collections import Counter
from concurrent.futures import Future, ThreadPoolExecutor
from pathlib import Path
from queue import Queue
from typing import Any
from musicdl.catalogsync.catalog_export import run_catalog_export_command
from musicdl.catalogsync.downloader import DownloadPlanner
from musicdl.catalogsync.repository import CatalogRepository
from musicdl.catalogsync.services import CatalogSyncService
from musicdl.catalogsync.uploader import CatalogUploader
from .jobdefs import DOWNLOAD_LANE, JOB_STAGE_SEQUENCES, job_lane_type
from .executors import (
CollectStageExecutor,
DownloadStageExecutor,
SyncStageExecutor,
UploadStageExecutor,
)
from .models import JobStatus, StageStatus
from .repository import OpsRepository
# NOTE(review): presumably the fallback worker counts for download and sync
# stages; their use lies outside the visible part of this module — confirm.
DEFAULT_DOWNLOAD_WORKERS = 10
DEFAULT_SYNC_WORKERS = 4
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
def _unique_preserve_order(values: list[str]) -> list[str]:
normalized: list[str] = []
seen: set[str] = set()
for value in values:
item = str(value).strip()
if not item or item in seen:
continue
normalized.append(item)
seen.add(item)
return normalized
def _split_csv(value: Any) -> list[str]:
if isinstance(value, list):
return [str(item).strip() for item in value if str(item).strip()]
if not value:
return []
return [part.strip() for part in str(value).split(",") if part.strip()]
def _int_value(value: Any, default: int) -> int:
    """Parse *value* as a positive integer, falling back to *default*.

    Args:
        value: Anything accepted by ``int()``; typically a string or number
            read from a config snapshot.
        default: Value returned when *value* cannot be converted or the
            converted number is not strictly positive.

    Returns:
        ``int(value)`` when it is greater than zero, otherwise *default*.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers ``int(float("inf"))``; an infinite float is not
        # a usable worker count, so it falls back like any other bad value.
        return default
    return parsed if parsed > 0 else default
class OpsRunner:
def __init__(
    self,
    repository: OpsRepository,
    sleep_seconds: float = 1.0,
    *,
    download_lane_concurrency: int = 1,
    general_lane_concurrency: int = 3,
):
    """Set up the runner's repositories, job thread pool and export locks.

    Args:
        repository: Ops repository used for all job/stage/item bookkeeping;
            its ``db_path`` is also used to open the catalog repository.
        sleep_seconds: Idle delay of the main loop, clamped to at least 0.1.
        download_lane_concurrency: Accepted for interface compatibility; the
            effective value is currently fixed at 1 (see note below).
        general_lane_concurrency: Maximum concurrently running jobs in the
            general lane, clamped to at least 1.
    """
    self.repository = repository
    self.sleep_seconds = max(float(sleep_seconds), 0.1)
    # NOTE(review): the ``download_lane_concurrency`` argument is not used;
    # the download lane is pinned to a single job. Confirm this is intended.
    self.download_lane_concurrency = 1
    self.general_lane_concurrency = max(int(general_lane_concurrency), 1)
    # One pool slot per job that may run at the same time across both lanes.
    total_job_slots = self.download_lane_concurrency + self.general_lane_concurrency
    self._job_pool = ThreadPoolExecutor(max_workers=total_job_slots)
    # job id -> future of its _run_job call; guarded by _futures_lock.
    self._futures: dict[int, Future[None]] = {}
    self._futures_lock = threading.Lock()
    self._playlist_export_lock = threading.Lock()
    self._catalog_export_lock = threading.Lock()
    # (stage id, playlist id) pairs already exported; guarded by
    # _playlist_export_lock.
    self._exported_stage_playlists: set[tuple[int, int]] = set()
    self.db_path = Path(self.repository.db_path)
    self.catalog_repo = CatalogRepository(self.db_path)
def recover_incomplete_jobs(self) -> None:
for job in self.repository.list_recoverable_jobs():
self.repository.pause_job_for_recovery(job.id)
for item in self.repository.list_running_items(job.id):
self.repository.mark_item_interrupted(
item.id,
last_error="Recovery interrupted running item after runner restart.",
)
self.repository.add_job_event(
job.id,
"recovery_requeued",
"Recovered incomplete job and re-queued resumable work.",
)
self.repository.resume_job(job.id)
def apply_pending_commands(self) -> None:
for command in self.repository.list_pending_commands():
command_type = str(command["command_type"])
job_id = int(command["job_run_id"])
command_id = int(command["id"])
target_item_id = command["target_item_id"]
if command_type == "pause":
self.repository.request_job_pause(job_id)
elif command_type == "resume":
self.repository.resume_job(job_id)
elif command_type == "cancel":
self.repository.cancel_job(job_id)
elif command_type == "retry_item":
if target_item_id is None:
self.repository.add_job_event(
job_id,
"ignored_command",
"retry_item command missing target_item_id.",
details={"command_type": command_type, "command_id": command_id},
)
elif not self.repository.requeue_item(
int(target_item_id), force=False, job_id=job_id
):
self.repository.add_job_event(
job_id,
"retry_rejected",
"retry_item command rejected.",
item_id=int(target_item_id),
details={"command_type": command_type, "command_id": command_id},
)
elif command_type == "force_retry_item":
if target_item_id is None:
self.repository.add_job_event(
job_id,
"ignored_command",
"force_retry_item command missing target_item_id.",
details={"command_type": command_type, "command_id": command_id},
)
elif not self.repository.requeue_item(
int(target_item_id), force=True, job_id=job_id
):
self.repository.add_job_event(
job_id,
"retry_rejected",
"force_retry_item command rejected.",
item_id=int(target_item_id),
details={"command_type": command_type, "command_id": command_id},
)
else:
self.repository.add_job_event(
job_id,
"ignored_command",
"Unsupported command type.",
details={"command_type": command_type, "command_id": command_id},
)
self.repository.mark_command_applied(command_id)
def reconcile_pause_state(self, job_id: int) -> None:
if self.repository.job_has_running_items(job_id):
return
self.repository.finalize_pause(job_id)
def run_forever(self, stop_event=None) -> None:
self.recover_incomplete_jobs()
while stop_event is None or not stop_event.is_set():
worked = self.loop_once()
if worked:
continue
if stop_event is not None:
stop_event.wait(self.sleep_seconds)
else:
time.sleep(self.sleep_seconds)
def loop_once(self) -> bool:
had_commands = bool(self.repository.list_pending_commands())
self.apply_pending_commands()
finished = self._reap_finished_jobs()
started = self._start_eligible_jobs()
return bool(had_commands or finished or started)
def _reap_finished_jobs(self) -> int:
finished_count = 0
finished_futures: list[tuple[int, Future[None]]] = []
with self._futures_lock:
for job_id, future in list(self._futures.items()):
if not future.done():
continue
del self._futures[job_id]
finished_futures.append((job_id, future))
for job_id, future in finished_futures:
try:
future.result()
except Exception as exc:
self.repository.add_job_event(
job_id,
"job_future_error",
str(exc),
)
job = self.repository.get_job(job_id)
if job is not None and job.status not in {
JobStatus.COMPLETED,
JobStatus.COMPLETED_WITH_ERRORS,
JobStatus.FAILED,
JobStatus.CANCELED,
JobStatus.PAUSED,
}:
self.repository.mark_job_finished(
job_id,
status=JobStatus.FAILED,
last_error=str(exc),
)
finished_count += 1
return finished_count
def _submit_job(self, job_id: int) -> bool:
with self._futures_lock:
if job_id in self._futures:
return False
self._futures[job_id] = self._job_pool.submit(self._run_job, job_id)
return True
def _start_eligible_jobs(self) -> int:
    """Submit active jobs and claim queued jobs while their lane has room.

    Active jobs are (re)submitted unless a pause was requested, in which
    case the pause is reconciled instead. Queued jobs are claimed only while
    the number of jobs counted for their lane is below that lane's limit.

    Returns:
        The number of jobs newly submitted to the job pool.
    """
    repo = self.repository
    active_jobs = repo.list_active_jobs()
    # Lane usage starts from the currently active jobs and grows as queued
    # jobs are claimed below.
    lane_usage = Counter(job_lane_type(job.job_type) for job in active_jobs)
    started = 0

    for job in active_jobs:
        if job.status == JobStatus.PAUSE_REQUESTED:
            self.reconcile_pause_state(job.id)
        elif self._submit_job(job.id):
            started += 1

    for queued in repo.list_queued_jobs():
        lane = job_lane_type(queued.job_type)
        if lane == DOWNLOAD_LANE:
            limit = self.download_lane_concurrency
        else:
            limit = self.general_lane_concurrency
        if lane_usage[lane] >= limit:
            continue
        claimed = repo.claim_job_if_queued(queued.id)
        if claimed is None:
            # Nothing was claimed for this job; leave lane usage untouched.
            continue
        lane_usage[lane] += 1
        if self._submit_job(claimed.id):
            started += 1
    return started
def _run_job(self, job_id: int) -> None:
    """Drive one job through its stages until it finishes or is interrupted.

    The job is re-read from the repository before and after every stage so
    that cancel and pause requests take effect between stages. Any exception
    is recorded as a ``job_execution_error`` event and, unless the job is
    already in a terminal or paused state, the job is marked FAILED.
    """
    repo = self.repository

    def halt_if_interrupted(snapshot) -> bool:
        # Settle a canceled or pause-requested job; True means "stop now".
        if snapshot.status == JobStatus.CANCELED:
            repo.finalize_canceled_job(job_id)
            return True
        if snapshot.status == JobStatus.PAUSE_REQUESTED:
            self.reconcile_pause_state(job_id)
            return True
        return False

    try:
        job = repo.get_job(job_id)
        if job is None or halt_if_interrupted(job):
            return
        if job.status == JobStatus.PAUSED:
            return

        if not repo.mark_job_running(job_id):
            # The transition was refused; settle a cancel/pause that may
            # have arrived in the meantime and give up on this run.
            job = repo.get_job(job_id)
            if job is not None:
                halt_if_interrupted(job)
            return

        job = repo.get_job(job_id)
        if job is None:
            return
        self._ensure_job_stages(job)

        while True:
            job = repo.get_job(job_id)
            if job is None or halt_if_interrupted(job):
                return

            stage = self._next_runnable_stage(job_id)
            if stage is None:
                if self._job_is_finished(job_id):
                    self._finalize_job(job_id)
                    return
                if any(
                    row.status in {StageStatus.PAUSED, StageStatus.PAUSE_REQUESTED}
                    for row in repo.list_job_stages(job_id)
                ):
                    repo.pause_job_for_recovery(job_id)
                    return
                raise RuntimeError("Job has no runnable stages but is not finished.")

            self._run_stage(job, stage)

            job_after_stage = repo.get_job(job_id)
            if job_after_stage is None or halt_if_interrupted(job_after_stage):
                return
            if self._job_is_finished(job_id):
                self._finalize_job(job_id)
                return
    except Exception as exc:
        self.repository.add_job_event(job_id, "job_execution_error", str(exc))
        failed_job = self.repository.get_job(job_id)
        if failed_job is None:
            return
        settled_statuses = {
            JobStatus.COMPLETED,
            JobStatus.COMPLETED_WITH_ERRORS,
            JobStatus.FAILED,
            JobStatus.CANCELED,
            JobStatus.PAUSED,
        }
        if failed_job.status not in settled_statuses:
            self.repository.mark_job_finished(
                job_id,
                status=JobStatus.FAILED,
                last_error=str(exc),
            )
def _ensure_job_stages(self, job) -> None:
existing = self.repository.list_job_stages(job.id)
if existing:
return
for seq_no, stage_type in enumerate(
JOB_STAGE_SEQUENCES.get(str(job.job_type), []), start=1
):
self.repository.create_stage(job_run_id=job.id, stage_type=stage_type, seq_no=seq_no)
def _next_runnable_stage(self, job_id: int):
for stage in self.repository.list_job_stages(job_id):
if stage.status in {StageStatus.PENDING, StageStatus.RUNNING}:
return stage
return None
def _job_sources(self, job) -> list[str]:
    """Return the job's source names, de-duplicated in first-seen order.

    ``job.sources`` wins when it is non-empty; otherwise the ``SOURCES``
    entry of the config snapshot is split on commas. A missing (None)
    config snapshot is treated as empty, matching the other config readers
    in this class.
    """
    settings = job.config_snapshot or {}
    chosen = job.sources or _split_csv(settings.get("SOURCES"))
    return _unique_preserve_order(list(chosen))
def _job_download_sources(self, job) -> list[str]:
    """Return the job's download source names, de-duplicated in order.

    The first non-empty candidate wins: ``job.download_sources``, then the
    snapshot's ``download_sources`` entry, then its ``DOWNLOAD_SOURCES``
    entry. A missing (None) config snapshot is treated as empty, matching
    the other config readers in this class.
    """
    settings = job.config_snapshot or {}
    chosen = (
        job.download_sources
        or _split_csv(settings.get("download_sources"))
        or _split_csv(settings.get("DOWNLOAD_SOURCES"))
    )
    return _unique_preserve_order(list(chosen))
def _job_playlist_ids(self, job) -> list[int] | None:
raw_value = job.playlist_scope.get("playlist_ids")
if not isinstance(raw_value, list):
return None
playlist_ids = []
for item in raw_value:
try:
playlist_ids.append(int(item))
except (TypeError, ValueError):
continue
return playlist_ids or None
def _resolve_library_root(self, job) -> Path:
mapping = dict(job.config_snapshot or {})
library_dir = mapping.get("LIBRARY_DIR") or mapping.get("library_dir")
if library_dir:
return Path(str(library_dir)).resolve()
try:
backend = self.catalog_repo.get_backend(self.catalog_repo.get_default_backend_id())
except Exception:
backend = None
if backend and backend["base_path"]:
return Path(str(backend["base_path"])).resolve()
raise RuntimeError("No library root configured for download stage")
def _resolve_playlists_root(self, job) -> Path | None:
mapping = dict(job.config_snapshot or {})
root_dir = mapping.get("ROOT_DIR") or mapping.get("root_dir")
if root_dir:
path = Path(str(root_dir)).resolve() / "playlists"
path.mkdir(parents=True, exist_ok=True)
return path
library_dir = mapping.get("LIBRARY_DIR") or mapping.get("library_dir")
if library_dir:
path = Path(str(library_dir)).resolve().parent / "playlists"
path.mkdir(parents=True, exist_ok=True)
return path
library_root = self.catalog_repo.get_default_local_library_root()
if library_root is None:
return None
path = library_root.parent / "playlists"
path.mkdir(parents=True, exist_ok=True)
return path
def _mark_playlist_exported(self, stage_id: int, playlist_id: int) -> bool:
key = (int(stage_id), int(playlist_id))
with self._playlist_export_lock:
if key in self._exported_stage_playlists:
return False
self._exported_stage_playlists.add(key)
return True
def _forget_playlist_exported(self, stage_id: int, playlist_id: int) -> None:
key = (int(stage_id), int(playlist_id))
with self._playlist_export_lock:
self._exported_stage_playlists.discard(key)
def _export_playlist_artifacts_for_playlist_if_ready(self, job, stage, playlist_id: int | None) -> bool:
if str(stage.stage_type) != "download" or playlist_id is None:
return False
scoped_playlist_ids = self._job_playlist_ids(job)
normalized_playlist_id = int(playlist_id)
if not scoped_playlist_ids or normalized_playlist_id not in scoped_playlist_ids:
return False
if self.repository.playlist_has_open_items(stage.id, normalized_playlist_id):
return False
if not self._mark_playlist_exported(stage.id, normalized_playlist_id):
return False
playlists_root = self._resolve_playlists_root(job)
if playlists_root is None:
self.repository.add_job_event(
job.id,
"playlist_export_skipped",
"Playlists root is not configured for scoped download export.",
stage_id=stage.id,
details={"playlist_id": normalized_playlist_id},
)
return False
service = CatalogSyncService(
repository=self.catalog_repo,
playlists_root=playlists_root,
)
try:
folder_path = service.ensure_playlist_artifacts_for_playlist(normalized_playlist_id)
except Exception as exc:
self._forget_playlist_exported(stage.id, normalized_playlist_id)
self.repository.add_job_event(
job.id,
"playlist_export_error",
str(exc),
stage_id=stage.id,
details={"playlist_id": normalized_playlist_id},
)
return False
if folder_path is None:
self.repository.add_job_event(
job.id,
"playlist_export_skipped",
"Playlist export row is unavailable.",
stage_id=stage.id,
details={"playlist_id": normalized_playlist_id},
)
return False
self.repository.add_job_event(
job.id,
"playlist_export_ready",
f"Exported playlist artifacts for playlist {normalized_playlist_id}.",
stage_id=stage.id,
details={"playlist_id": normalized_playlist_id, "playlist_dir": str(folder_path)},
)
return True
def _refresh_ready_playlist_artifacts(self, job, stage) -> list[int]:
if str(stage.stage_type) != "download":
return []
playlist_ids = self._job_playlist_ids(job)
if not playlist_ids:
return []
exported_ids: list[int] = []
for playlist_id in playlist_ids:
if self._export_playlist_artifacts_for_playlist_if_ready(job, stage, int(playlist_id)):
exported_ids.append(int(playlist_id))
return exported_ids
def _resolve_backend_name(self, job) -> str:
value = (
job.config_snapshot.get("OBJECT_BACKEND_NAME")
or job.config_snapshot.get("object_backend_name")
or ""
)
return str(value).strip()
def _worker_count(self, job, stage_type: str) -> int:
    """Return how many workers to use for a stage of the given type.

    ``download``, ``sync`` and ``upload`` stages read ``DOWNLOAD_WORKERS``,
    ``SYNC_WORKERS`` and ``UPLOAD_WORKERS`` from the config snapshot, with
    fallbacks for missing or non-positive values. Any other stage type
    uses a single worker.
    """
    settings = dict(job.config_snapshot or {})
    if stage_type == "download":
        setting_key, fallback = "DOWNLOAD_WORKERS", DEFAULT_DOWNLOAD_WORKERS
    elif stage_type == "sync":
        setting_key, fallback = "SYNC_WORKERS", DEFAULT_SYNC_WORKERS
    elif stage_type == "upload":
        # The upload fallback has no named module constant.
        setting_key, fallback = "UPLOAD_WORKERS", 4
    else:
        return 1
    return _int_value(settings.get(setting_key), fallback)
def _download_stage_worker_split(self, total_workers: int) -> tuple[int, int]:
normalized_total = max(int(total_workers or 0), 1)
if normalized_total == 1:
return 1, 0
if normalized_total == 2:
return 1, 1
if normalized_total <= 5:
download_workers = 1
else:
download_workers = 2
resolver_workers = max(1, normalized_total - download_workers)
return resolver_workers, download_workers
def _materialize_stage_items(self, job, stage) -> None:
    """Create the work items of *stage* if it does not have any yet.

    Does nothing when the stage no longer exists or already reports items.
    Otherwise items are created per stage type:

    * ``collect``: one ``collect_source`` item per job source.
    * ``sync``: one ``playlist_sync`` item per playlist, either the scoped
      playlists or all playlists of the job's sources.
    * ``download``: one ``song_download`` item per row of the planner's
      download queue.
    * ``upload``: missing uploads are enqueued for the configured backend,
      then one ``file_upload`` item is created per pending upload task.
    """
    stage_snapshot = self.repository.get_stage(stage.id)
    if stage_snapshot is None or stage_snapshot.total_items > 0:
        return

    scoped_playlist_ids = self._job_playlist_ids(job)
    stage_type = stage.stage_type

    if stage_type == "collect":
        for source_name in self._job_sources(job):
            self.repository.create_item(
                job_stage_id=stage.id,
                item_type="collect_source",
                item_key=f"collect:{source_name}",
                payload={
                    "source": source_name,
                    "include_playlist_square": True,
                    "include_toplist": True,
                },
            )
        return

    if stage_type == "sync":
        if scoped_playlist_ids:
            playlist_rows = self.catalog_repo.list_playlists_by_ids(scoped_playlist_ids)
        else:
            playlist_rows = self.catalog_repo.list_playlists(sources=self._job_sources(job))
        for playlist_row in playlist_rows:
            playlist_id = int(playlist_row["id"])
            self.repository.create_item(
                job_stage_id=stage.id,
                item_type="playlist_sync",
                item_key=f"playlist:{playlist_id}",
                playlist_id=playlist_id,
                payload={"playlist_row": dict(playlist_row)},
            )
        return

    if stage_type == "download":
        download_queue = DownloadPlanner(self.catalog_repo).build_download_queue(
            sources=self._job_sources(job),
            playlist_ids=scoped_playlist_ids,
        )
        for queue_row in download_queue:
            # Use the row's own "id" when "song_id" is missing or falsy.
            song_id = int(queue_row.get("song_id") or queue_row["id"])
            self.repository.create_item(
                job_stage_id=stage.id,
                item_type="song_download",
                item_key=f"song:{song_id}",
                song_id=song_id,
                playlist_id=queue_row.get("playlist_id"),
                payload={"row": dict(queue_row)},
            )
        return

    if stage_type != "upload":
        return

    backend_name = self._resolve_backend_name(job)
    if not backend_name:
        return
    CatalogUploader(self.catalog_repo).enqueue_missing_uploads(
        backend_name=backend_name,
        sources=self._job_sources(job) or None,
        playlist_ids=scoped_playlist_ids,
    )
    backend = self.catalog_repo.get_backend_by_name(backend_name)
    if backend is None:
        return
    pending_uploads = self.catalog_repo.list_pending_upload_tasks(
        target_backend_id=int(backend["id"])
    )
    for upload_row in pending_uploads:
        upload_task_id = int(upload_row["id"])
        self.repository.create_item(
            job_stage_id=stage.id,
            item_type="file_upload",
            item_key=f"upload:{upload_task_id}",
            file_location_id=upload_row["source_location_id"],
            payload={
                "upload_task_id": upload_task_id,
                "upload_row": dict(upload_row),
            },
        )
def _build_executor(self, job, stage):
    """Instantiate the stage executor matching ``stage.stage_type``.

    Raises:
        RuntimeError: For an upload stage without a configured object
            backend, or for an unsupported stage type. Resolving the
            library root for a download stage may raise as well.
    """
    stage_type = stage.stage_type
    if stage_type in ("collect", "sync"):
        executor_cls = CollectStageExecutor if stage_type == "collect" else SyncStageExecutor
        return executor_cls(self.db_path, ops_repo=self.repository)

    if stage_type == "download":
        library_root = self._resolve_library_root(job)
        download_sources = self._job_download_sources(job)
        return DownloadStageExecutor(
            self.db_path,
            library_root=library_root,
            download_sources=download_sources,
            ops_repo=self.repository,
        )

    if stage_type == "upload":
        backend_name = self._resolve_backend_name(job)
        if backend_name:
            return UploadStageExecutor(
                self.db_path,
                backend_name=backend_name,
                ops_repo=self.repository,
            )
        raise RuntimeError("No object backend configured for upload stage")

    raise RuntimeError(f"Unsupported stage type: {stage.stage_type}")
def _export_playlist_artifacts_for_job(self, job, stage) -> None:
exported_ids = self._refresh_ready_playlist_artifacts(job, stage)
playlist_ids = self._job_playlist_ids(job) or []
if str(stage.stage_type) != "download" or not playlist_ids:
return
try:
self.repository.add_job_event(
job.id,
"playlist_exported",
f"Refreshed playlist export folders for {len(exported_ids)} playlists.",
stage_id=stage.id,
details={"playlist_ids": exported_ids, "scoped_playlist_ids": playlist_ids},
)
except Exception:
logger.warning(
"Failed to persist playlist_exported event for job %s stage %s.",
job.id,
stage.id,
exc_info=True,
)
def _run_catalog_export_for_stage(self, job, stage) -> None:
if str(stage.stage_type) != "download":
return
with self._catalog_export_lock:
refreshed_job = self.repository.get_job(job.id) or job
if refreshed_job.status in {
JobStatus.CANCELED,
JobStatus.PAUSE_REQUESTED,
JobStatus.PAUSED,
}:
return
self.repository.add_job_event(
job.id,
"catalog_export_started",
"Started post-download catalog export command.",
stage_id=stage.id,
)
try:
result = run_catalog_export_command(refreshed_job.config_snapshot)
except Exception as exc:
self.repository.add_job_event(
job.id,
"catalog_export_failed",
f"Catalog export command raised an error: {exc}",
stage_id=stage.id,
details={"error": str(exc) or exc.__class__.__name__},
)
return
details: dict[str, Any] = {}
if result.command:
details["command"] = result.command
if result.workdir:
details["workdir"] = result.workdir
if result.returncode is not None:
details["returncode"] = result.returncode
if result.stdout:
details["stdout"] = result.stdout
if result.stderr:
details["stderr"] = result.stderr
normalized_status = str(result.status).strip().lower()
if normalized_status == "succeeded":
event_type = "catalog_export_succeeded"
message = "Catalog export command completed successfully."
elif normalized_status == "skipped":
event_type = "catalog_export_skipped"
message = "Catalog export command was skipped."
else:
event_type = "catalog_export_failed"
message = "Catalog export command failed."
self.repository.add_job_event(
job.id,
event_type,
message,
stage_id=stage.id,
details=details or None,
)
def _run_stage_with_single_pool(self, job, stage, executor, worker_count: int) -> None:
    """Process the stage's items with *worker_count* identical workers.

    Each worker keeps claiming the next stage item until none is left or
    the job disappears, is canceled, or has a pause requested. Exceptions
    raised while processing an item are recorded as ``item_execution_error``
    events and do not stop the worker.
    """
    repo = self.repository

    def job_should_stop() -> bool:
        snapshot = repo.get_job(job.id)
        return snapshot is None or snapshot.status in {
            JobStatus.PAUSE_REQUESTED,
            JobStatus.CANCELED,
        }

    def worker_loop(worker_index: int) -> None:
        worker_name = f"{stage.stage_type}-{worker_index + 1}"
        while not job_should_stop():
            item = repo.claim_next_stage_item(stage.id, worker_name)
            if item is None:
                break
            try:
                executor.process_item(item.id, worker_name, already_claimed=True)
                self._export_playlist_artifacts_for_playlist_if_ready(
                    job,
                    stage,
                    item.playlist_id,
                )
            except Exception as exc:
                repo.add_job_event(
                    job.id,
                    "item_execution_error",
                    str(exc),
                    stage_id=stage.id,
                    item_id=item.id,
                )

    with ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending_workers = [pool.submit(worker_loop, index) for index in range(worker_count)]
        for pending in pending_workers:
            # Surfaces an exception that escaped a worker loop.
            pending.result()
def _run_download_stage_pipeline(self, job, stage, executor, worker_count: int) -> None:
resolver_workers, download_workers = self._download_stage_worker_split(worker_count)
if download_workers == 0:
self._run_stage_with_single_pool(job, stage, executor, worker_count)
return
ready_queue: Queue = Queue(maxsize=max(1, download_workers * 2))
stop_event = threading.Event()
sentinel = object()
def resolver_loop(worker_index: int) -> None:
worker_name = f"resolve-{worker_index + 1}"
while not stop_event.is_set():
active_job = self.repository.get_job(job.id)
if active_job is None or active_job.status in {
JobStatus.PAUSE_REQUESTED,
JobStatus.CANCELED,
}:
stop_event.set()
return
item = self.repository.claim_next_stage_item(stage.id, worker_name)
if item is None:
return
try:
executor.process_resolve_item(
item.id,
worker_name,
ready_queue=ready_queue,
already_claimed=True,
)
self._export_playlist_artifacts_for_playlist_if_ready(
job,
stage,
item.playlist_id,
)
except Exception as exc:
self.repository.add_job_event(
job.id,
"item_execution_error",
str(exc),
stage_id=stage.id,
item_id=item.id,
)
def download_loop(worker_index: int) -> None:
worker_name = f"download-{worker_index + 1}"
while True:
task = ready_queue.get()
if task is sentinel:
return
try:
executor.process_download_task(task, worker_name)
self._export_playlist_artifacts_for_playlist_if_ready(
job,
stage,
getattr(task, "playlist_id", None),
)
except Exception as exc:
self.repository.add_job_event(
job.id,
"item_execution_error",
str(exc),
stage_id=stage.id,
item_id=getattr(task, "item_id", None),
)
with ThreadPoolExecutor(max_workers=resolver_workers + download_workers) as pool:
resolver_futures = [pool.submit(resolver_loop, index) for index in range(resolver_workers)]
download_futures = [pool.submit(download_loop, index) for index in range(download_workers)]
for future in resolver_futures:
future.result()
for _ in range(download_workers):
ready_queue.put(sentinel)
for future in download_futures:
future.result()
def _run_stage(self, job, stage) -> None:
    """Execute one stage: start it, create its items, process, finish.

    A PENDING stage is first marked running. A stage without items is
    completed right away. Otherwise the items are processed (download
    stages through the pipeline, all others through a single pool) and the
    stage is finished as FAILED or COMPLETED once no open items remain. If
    the job was canceled or a pause was requested meanwhile, that is
    settled instead and the stage is left as it is. After a stage has been
    finished, the playlist and catalog exports are triggered.
    """
    repo = self.repository

    def run_post_stage_exports() -> None:
        final_stage = repo.get_stage(stage.id)
        if final_stage is not None:
            self._export_playlist_artifacts_for_job(job, final_stage)
            self._run_catalog_export_for_stage(job, final_stage)

    if stage.status == StageStatus.PENDING:
        repo.mark_stage_running(stage.id)
        repo.add_job_event(
            job.id,
            "stage_started",
            f"Started stage {stage.stage_type}.",
            stage_id=stage.id,
        )

    self._materialize_stage_items(job, stage)
    stage_snapshot = repo.get_stage(stage.id)
    if stage_snapshot is None:
        return
    if stage_snapshot.total_items == 0:
        # Nothing to process: the stage completes immediately.
        repo.mark_stage_finished(stage.id, status=StageStatus.COMPLETED)
        run_post_stage_exports()
        return

    executor = self._build_executor(job, stage_snapshot)
    worker_count = self._worker_count(job, stage_snapshot.stage_type)
    if stage_snapshot.stage_type == "download":
        run_items = self._run_download_stage_pipeline
    else:
        run_items = self._run_stage_with_single_pool
    run_items(job, stage_snapshot, executor, worker_count)

    job_now = repo.get_job(job.id)
    if job_now is not None:
        if job_now.status == JobStatus.CANCELED:
            repo.finalize_canceled_job(job.id)
            return
        if job_now.status == JobStatus.PAUSE_REQUESTED:
            self.reconcile_pause_state(job.id)
            return

    stage_now = repo.get_stage(stage.id)
    if stage_now is None or repo.stage_has_open_items(stage.id):
        return

    if stage_now.failed_items > 0:
        repo.mark_stage_finished(
            stage.id,
            status=StageStatus.FAILED,
            last_error="One or more stage items failed.",
        )
    else:
        repo.mark_stage_finished(stage.id, status=StageStatus.COMPLETED)
    run_post_stage_exports()
def _job_is_finished(self, job_id: int) -> bool:
stages = self.repository.list_job_stages(job_id)
if not stages:
return True
return all(
stage.status in {StageStatus.COMPLETED, StageStatus.FAILED, StageStatus.SKIPPED}
for stage in stages
)
def _finalize_job(self, job_id: int) -> None:
    """Mark the job finished, reflecting whether any stage had failures.

    A job without stages is COMPLETED. Otherwise it is
    COMPLETED_WITH_ERRORS when any stage is FAILED or reports failed items,
    and COMPLETED when none does.
    """
    stages = self.repository.list_job_stages(job_id)
    if not stages:
        self.repository.mark_job_finished(job_id, status=JobStatus.COMPLETED)
        return

    has_errors = False
    for stage in stages:
        if stage.status == StageStatus.FAILED or stage.failed_items > 0:
            has_errors = True
            break

    if has_errors:
        final_status = JobStatus.COMPLETED_WITH_ERRORS
        final_error = "One or more stage items failed."
    else:
        final_status = JobStatus.COMPLETED
        final_error = None
    self.repository.mark_job_finished(
        job_id,
        status=final_status,
        last_error=final_error,
    )
File diff suppressed because it is too large Load Diff