Files
musicdl-catalog-sync-suite/catalog-sync/musicdl/catalogsync/uploader.py
T

222 lines
9.1 KiB
Python

from __future__ import annotations

import json
import os
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

from .repository import CatalogRepository
def load_backend_config(backend_row) -> dict:
    """Decode the JSON stored in the backend row's ``config_json`` column.

    Args:
        backend_row: Mapping-like row exposing a ``"config_json"`` key.

    Returns:
        The value produced by ``json.loads``. A ``None`` or empty column is
        decoded as an empty JSON object (``{}``).
    """
    raw_config = backend_row["config_json"]
    if not raw_config:
        # Missing/blank configuration is treated as "no settings".
        raw_config = "{}"
    return json.loads(raw_config)
def normalize_prefix(value: str | None) -> str:
    """Return *value* without surrounding whitespace or leading/trailing slashes.

    Falsy input (``None``, ``""``) yields an empty string.
    """
    if not value:
        return ""
    trimmed = str(value).strip()
    return trimmed.strip("/")
def build_target_locator(base_prefix: str | None, relative_locator: str) -> str:
    """Combine a base prefix and a relative locator into a single locator.

    The relative locator is stripped of surrounding whitespace and leading
    slashes; the prefix is cleaned with ``normalize_prefix``. When the cleaned
    prefix is empty the cleaned relative locator is returned on its own,
    otherwise the two are joined with a single ``/``.
    """
    relative_part = str(relative_locator).strip().lstrip("/")
    prefix_part = normalize_prefix(base_prefix)
    return f"{prefix_part}/{relative_part}" if prefix_part else relative_part
def derive_public_url(public_base_url: str | None, locator: str, base_prefix: str | None) -> str | None:
    """Build the public URL for *locator* under *public_base_url*.

    Args:
        public_base_url: Base URL of the public endpoint; surrounding
            whitespace and trailing slashes are ignored.
        locator: Object locator; surrounding whitespace and leading slashes
            are ignored.
        base_prefix: Backend prefix, cleaned with ``normalize_prefix``.

    Returns:
        ``None`` when no base URL is configured. Otherwise the base URL joined
        with the locator, or the bare base URL when the locator ends up empty.
    """
    root_url = str(public_base_url or "").strip().rstrip("/")
    if not root_url:
        return None

    prefix = normalize_prefix(base_prefix)
    path = str(locator).strip().lstrip("/")

    # When the base URL already ends in the prefix and the locator starts with
    # it as well, drop the locator's copy so the prefix is not repeated.
    repeated_head = f"{prefix}/"
    if prefix and root_url.endswith(f"/{prefix}") and path.startswith(repeated_head):
        path = path[len(repeated_head):].lstrip("/")

    if path:
        return f"{root_url}/{path}"
    return root_url
def build_s3_client(backend_row):
    """Create a boto3 S3 client for the given object-storage backend row.

    Settings are read from the row's JSON configuration:

    * ``credential_env_prefix`` (required): prefix of the environment
      variables ``<PREFIX>_ACCESS_KEY_ID``, ``<PREFIX>_SECRET_ACCESS_KEY`` and
      the optional ``<PREFIX>_SESSION_TOKEN``.
    * ``endpoint``: custom endpoint URL; blank means "use the default".
    * ``region``: region name; blank or ``"auto"`` means "not specified".
    * ``addressing_style``: ``"path"`` or ``"virtual"``; any other value
      leaves the client configuration unset.

    Args:
        backend_row: Mapping-like row with ``"config_json"`` and ``"name"``.

    Returns:
        The client returned by ``boto3.client("s3", ...)``.

    Raises:
        RuntimeError: If ``credential_env_prefix`` is missing, if the access
            key or secret key environment variables are unset/empty, or if
            boto3 cannot be imported.
    """
    config = load_backend_config(backend_row)

    credential_env_prefix = str(config.get("credential_env_prefix") or "").strip()
    if not credential_env_prefix:
        raise RuntimeError("Object storage backend is missing credential_env_prefix")

    access_key_id = os.getenv(f"{credential_env_prefix}_ACCESS_KEY_ID")
    secret_access_key = os.getenv(f"{credential_env_prefix}_SECRET_ACCESS_KEY")
    session_token = os.getenv(f"{credential_env_prefix}_SESSION_TOKEN")
    if not access_key_id or not secret_access_key:
        raise RuntimeError(f"Missing credentials for backend {backend_row['name']}")

    # Imported lazily so the module can be loaded without boto3 installed.
    try:
        import boto3
        from botocore.config import Config
    except ImportError as exc:
        raise RuntimeError("boto3 is required for object storage uploads") from exc

    addressing_style = str(config.get("addressing_style") or "").strip().lower()
    client_config = None
    if addressing_style in {"path", "virtual"}:
        client_config = Config(s3={"addressing_style": addressing_style})

    region = str(config.get("region") or "").strip()
    if region.lower() == "auto":
        region = ""

    # Normalize the endpoint the same way as the region: boto3 accepts None
    # for "default endpoint" but rejects a blank string as an invalid URL.
    endpoint = str(config.get("endpoint") or "").strip()

    return boto3.client(
        "s3",
        endpoint_url=endpoint or None,
        region_name=region or None,
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_access_key,
        aws_session_token=session_token or None,
        config=client_config,
    )
class S3CompatibleUploader:
    """Uploads local files to one backend through an S3-style client."""

    def __init__(self, backend_row, client=None):
        """Store the backend row and its decoded config, and pick a client.

        Args:
            backend_row: Backend row passed to ``load_backend_config``.
            client: Client to use; when falsy, one is created with
                ``build_s3_client``.
        """
        self.backend = backend_row
        self.config = load_backend_config(backend_row)
        if not client:
            client = build_s3_client(backend_row)
        self.client = client

    def upload_file(self, local_path: Path, container_name: str, locator: str) -> dict[str, str | None]:
        """Upload *local_path* to *container_name* under *locator*.

        Returns:
            A dict with ``"public_url"`` (derived from the backend's
            ``public_base_url`` and ``base_prefix`` settings, possibly
            ``None``) and ``"download_url"`` (always ``None``).
        """
        source = str(local_path)
        self.client.upload_file(source, container_name, locator, ExtraArgs=None)

        settings = self.config
        public_url = derive_public_url(
            settings.get("public_base_url"),
            locator,
            settings.get("base_prefix"),
        )
        return {"public_url": public_url, "download_url": None}
class CatalogUploader:
    """Queue and execute uploads of catalog files to an object-storage backend.

    All persistent state (backends, candidates, upload tasks, remote file
    records) is read and written through the injected repository.
    """

    def __init__(
        self,
        repository: CatalogRepository,
        worker_count: int = 4,
        client_factory=None,
    ):
        """Configure the uploader.

        Args:
            repository: Repository used for every lookup and status update.
            worker_count: Number of worker threads used by ``run``; values
                below 1 are raised to 1.
            client_factory: Callable taking a backend row and returning a
                storage client. Defaults to ``build_s3_client``.
        """
        self.repository = repository
        self.worker_count = max(1, worker_count)
        # The lambda defers the lookup of build_s3_client until a client is
        # actually needed.
        self.client_factory = client_factory or (lambda backend_row: build_s3_client(backend_row))

    def get_backend(self, backend_name: str):
        """Return the backend row named *backend_name*.

        Raises:
            RuntimeError: If the backend does not exist or its
                ``backend_type`` is not ``"object_storage"``.
        """
        backend = self.repository.get_backend_by_name(backend_name)
        if backend is None:
            raise RuntimeError(f"Unknown backend: {backend_name}")
        if backend["backend_type"] != "object_storage":
            raise RuntimeError(f"Backend {backend_name} is not object storage")
        return backend

    def enqueue_missing_uploads(
        self,
        backend_name: str,
        sources: list[str] | None = None,
        limit: int | None = None,
        playlist_ids: list[int] | None = None,
    ) -> int:
        """Create upload tasks for candidates not yet present on the backend.

        Args:
            backend_name: Name of the target object-storage backend.
            sources: Optional filter forwarded to the repository.
            limit: Optional candidate limit forwarded to the repository.
            playlist_ids: Optional filter forwarded to the repository.

        Returns:
            The number of distinct task ids returned by the repository while
            enqueueing the candidates.
        """
        backend = self.get_backend(backend_name)
        backend_id = int(backend["id"])
        candidates = self.repository.list_missing_object_upload_candidates(
            target_backend_id=backend_id,
            sources=sources,
            limit=limit,
            playlist_ids=playlist_ids,
        )
        # Several candidates may resolve to the same task id; count each once.
        seen_task_ids: set[int] = set()
        for candidate in candidates:
            task_id = self.repository.enqueue_upload_task(
                file_asset_id=int(candidate["file_asset_id"]),
                source_location_id=int(candidate["source_location_id"]),
                target_backend_id=backend_id,
                target_container_name=candidate["target_container_name"],
                target_locator=candidate["target_locator"],
            )
            seen_task_ids.add(task_id)
        return len(seen_task_ids)

    def process_upload_task_row(self, task_row, backend_name: str) -> str:
        """Process a single upload task row against *backend_name*.

        Args:
            task_row: Task row to process, or ``None`` when there is nothing
                to process.
            backend_name: Name of the target object-storage backend.

        Returns:
            ``"skipped"`` when *task_row* is ``None``; otherwise the outcome
            of the task, ``"succeeded"`` or ``"failed"``.

        Raises:
            RuntimeError: If the backend is unknown or not object storage.
        """
        backend = self.get_backend(backend_name)
        if task_row is None:
            # Nothing to upload and no task id to update.
            return "skipped"
        source_path_text = task_row["absolute_path"]
        uploader = None
        # Only build a client when the source file is actually there; a
        # missing file is reported as a failed task without touching storage.
        if source_path_text and Path(source_path_text).exists():
            uploader = S3CompatibleUploader(backend, client=self.client_factory(backend))
        return self._process_task(task_row, backend, uploader)

    def run(self, backend_name: str, limit: int | None = None) -> dict[str, int]:
        """Claim and process upload tasks for *backend_name* on worker threads.

        Args:
            backend_name: Name of the target object-storage backend.
            limit: Maximum number of tasks to handle in this run. ``None``
                means workers keep claiming tasks until none are left.

        Returns:
            A summary dict with the keys ``"queued"`` (tasks listed as pending
            at the start), ``"succeeded"``, ``"failed"``, ``"skipped"`` and
            ``"workers"``.

        Raises:
            RuntimeError: If the backend is unknown or not object storage.
        """
        backend = self.get_backend(backend_name)
        backend_id = int(backend["id"])
        pending_tasks = self.repository.list_pending_upload_tasks(target_backend_id=backend_id, limit=limit)

        # One uploader is shared by all workers, created only if at least one
        # listed task has a source file on disk.
        uploader = None
        if any(row["absolute_path"] and Path(row["absolute_path"]).exists() for row in pending_tasks):
            uploader = S3CompatibleUploader(backend, client=self.client_factory(backend))

        # claim_next_upload_task takes no limit, so the cap is enforced here:
        # with a limit, workers may claim at most as many tasks as were
        # listed, keeping the outcome counters within "queued".
        remaining_claims = len(pending_tasks) if limit is not None else None
        budget_lock = threading.Lock()

        def reserve_claim() -> bool:
            """Return True if a worker may claim one more task."""
            nonlocal remaining_claims
            if remaining_claims is None:
                return True
            with budget_lock:
                if remaining_claims <= 0:
                    return False
                remaining_claims -= 1
                return True

        def worker():
            local_summary = {"succeeded": 0, "failed": 0, "skipped": 0}
            while reserve_claim():
                task = self.repository.claim_next_upload_task(target_backend_id=backend_id)
                if task is None:
                    break
                result = self._process_task(task, backend, uploader)
                local_summary[result] += 1
            return local_summary

        summary = {
            "queued": len(pending_tasks),
            "succeeded": 0,
            "failed": 0,
            "skipped": 0,
            "workers": self.worker_count,
        }
        with ThreadPoolExecutor(max_workers=self.worker_count) as executor:
            futures = [executor.submit(worker) for _ in range(self.worker_count)]
            for future in as_completed(futures):
                worker_summary = future.result()
                for key in ("succeeded", "failed", "skipped"):
                    summary[key] += int(worker_summary[key])
        return summary

    def _process_task(self, task, backend, uploader: S3CompatibleUploader | None) -> str:
        """Upload one task's source file and record the outcome.

        Args:
            task: Task row with ``id``, ``absolute_path``, ``file_asset_id``,
                ``target_backend_id``, ``target_container_name`` and
                ``target_locator``.
            backend: Backend row; its ``container_name`` is used when the task
                has no container of its own.
            uploader: Uploader to reuse, or ``None`` to create one on demand.

        Returns:
            ``"succeeded"`` or ``"failed"``. The task status is updated in the
            repository in both cases.
        """
        task_id = int(task["id"])
        source_path_text = task["absolute_path"]
        source_path = Path(source_path_text) if source_path_text else None
        if source_path is None or not source_path.exists():
            missing_path = str(source_path) if source_path is not None else "<missing>"
            self.repository.mark_upload_task_status(
                task_id=task_id,
                status="failed",
                last_error=f"Source file does not exist: {missing_path}",
            )
            return "failed"
        try:
            active_uploader = uploader or S3CompatibleUploader(backend, client=self.client_factory(backend))
            container_name = task["target_container_name"] or backend["container_name"]
            result = active_uploader.upload_file(
                local_path=source_path,
                container_name=container_name,
                locator=task["target_locator"],
            )
            self.repository.record_remote_file(
                file_asset_id=int(task["file_asset_id"]),
                backend_id=int(task["target_backend_id"]),
                container_name=container_name,
                locator=task["target_locator"],
                public_url=result["public_url"],
                download_url=result["download_url"],
            )
            self.repository.mark_upload_task_status(
                task_id=task_id,
                status="succeeded",
                last_error=None,
            )
            return "succeeded"
        except Exception as exc:
            # Task boundary: any failure is recorded on the task instead of
            # aborting the worker that is processing it.
            self.repository.mark_upload_task_status(
                task_id=task_id,
                status="failed",
                last_error=f"{type(exc).__name__}: {exc}",
            )
            return "failed"