from __future__ import annotations import json import os from concurrent.futures import ThreadPoolExecutor, as_completed from pathlib import Path from .repository import CatalogRepository def load_backend_config(backend_row) -> dict: return json.loads(backend_row["config_json"] or "{}") def normalize_prefix(value: str | None) -> str: return str(value or "").strip().strip("/") def build_target_locator(base_prefix: str | None, relative_locator: str) -> str: normalized_relative = str(relative_locator).strip().lstrip("/") normalized_prefix = normalize_prefix(base_prefix) if not normalized_prefix: return normalized_relative return f"{normalized_prefix}/{normalized_relative}" def derive_public_url(public_base_url: str | None, locator: str, base_prefix: str | None) -> str | None: base_url = str(public_base_url or "").strip().rstrip("/") if not base_url: return None normalized_prefix = normalize_prefix(base_prefix) normalized_locator = str(locator).strip().lstrip("/") if normalized_prefix: prefix_segment = f"{normalized_prefix}/" if base_url.endswith(f"/{normalized_prefix}") and normalized_locator.startswith(prefix_segment): normalized_locator = normalized_locator[len(prefix_segment) :] normalized_locator = normalized_locator.lstrip("/") if not normalized_locator: return base_url return f"{base_url}/{normalized_locator}" def build_s3_client(backend_row): config = load_backend_config(backend_row) credential_env_prefix = str(config.get("credential_env_prefix") or "").strip() if not credential_env_prefix: raise RuntimeError("Object storage backend is missing credential_env_prefix") access_key_id = os.getenv(f"{credential_env_prefix}_ACCESS_KEY_ID") secret_access_key = os.getenv(f"{credential_env_prefix}_SECRET_ACCESS_KEY") session_token = os.getenv(f"{credential_env_prefix}_SESSION_TOKEN") if not access_key_id or not secret_access_key: raise RuntimeError(f"Missing credentials for backend {backend_row['name']}") try: import boto3 from botocore.config import Config except ImportError as exc: raise RuntimeError("boto3 is required for object storage uploads") from exc addressing_style = str(config.get("addressing_style") or "").strip().lower() client_config = None if addressing_style in {"path", "virtual"}: client_config = Config(s3={"addressing_style": addressing_style}) region = str(config.get("region") or "").strip() if region.lower() == "auto": region = "" return boto3.client( "s3", endpoint_url=config.get("endpoint"), region_name=region or None, aws_access_key_id=access_key_id, aws_secret_access_key=secret_access_key, aws_session_token=session_token or None, config=client_config, ) class S3CompatibleUploader: def __init__(self, backend_row, client=None): self.backend = backend_row self.config = load_backend_config(backend_row) self.client = client or build_s3_client(backend_row) def upload_file(self, local_path: Path, container_name: str, locator: str) -> dict[str, str | None]: self.client.upload_file(str(local_path), container_name, locator, ExtraArgs=None) return { "public_url": derive_public_url( self.config.get("public_base_url"), locator, self.config.get("base_prefix"), ), "download_url": None, } class CatalogUploader: def __init__( self, repository: CatalogRepository, worker_count: int = 4, client_factory=None, ): self.repository = repository self.worker_count = max(1, worker_count) self.client_factory = client_factory or (lambda backend_row: build_s3_client(backend_row)) def get_backend(self, backend_name: str): backend = self.repository.get_backend_by_name(backend_name) if backend is None: raise RuntimeError(f"Unknown backend: {backend_name}") if backend["backend_type"] != "object_storage": raise RuntimeError(f"Backend {backend_name} is not object storage") return backend def enqueue_missing_uploads( self, backend_name: str, sources: list[str] | None = None, limit: int | None = None, playlist_ids: list[int] | None = None, ) -> int: backend = self.get_backend(backend_name) candidates = self.repository.list_missing_object_upload_candidates( target_backend_id=int(backend["id"]), sources=sources, limit=limit, playlist_ids=playlist_ids, ) queued_count = 0 seen_task_ids: set[int] = set() for candidate in candidates: task_id = self.repository.enqueue_upload_task( file_asset_id=int(candidate["file_asset_id"]), source_location_id=int(candidate["source_location_id"]), target_backend_id=int(backend["id"]), target_container_name=candidate["target_container_name"], target_locator=candidate["target_locator"], ) if task_id not in seen_task_ids: seen_task_ids.add(task_id) queued_count += 1 return queued_count def process_upload_task_row(self, task_row, backend_name: str) -> str: backend = self.get_backend(backend_name) source_path_text = task_row["absolute_path"] if task_row is not None else None uploader = None if source_path_text and Path(source_path_text).exists(): uploader = S3CompatibleUploader(backend, client=self.client_factory(backend)) return self._process_task(task_row, backend, uploader) def run(self, backend_name: str, limit: int | None = None) -> dict[str, int]: backend = self.get_backend(backend_name) backend_id = int(backend["id"]) pending_tasks = self.repository.list_pending_upload_tasks(target_backend_id=backend_id, limit=limit) uploader = None if any(row["absolute_path"] and Path(row["absolute_path"]).exists() for row in pending_tasks): uploader = S3CompatibleUploader(backend, client=self.client_factory(backend)) def worker(): local_summary = {"succeeded": 0, "failed": 0, "skipped": 0} while True: task = self.repository.claim_next_upload_task(target_backend_id=backend_id) if task is None: break result = self._process_task(task, backend, uploader) local_summary[result] += 1 return local_summary summary = { "queued": len(pending_tasks), "succeeded": 0, "failed": 0, "skipped": 0, "workers": self.worker_count, } with ThreadPoolExecutor(max_workers=self.worker_count) as executor: futures = [executor.submit(worker) for _ in range(self.worker_count)] for future in as_completed(futures): worker_summary = future.result() for key in ("succeeded", "failed", "skipped"): summary[key] += int(worker_summary[key]) return summary def _process_task(self, task, backend, uploader: S3CompatibleUploader | None) -> str: source_path_text = task["absolute_path"] source_path = Path(source_path_text) if source_path_text else None if source_path is None or not source_path.exists(): missing_path = str(source_path) if source_path is not None else "" self.repository.mark_upload_task_status( task_id=int(task["id"]), status="failed", last_error=f"Source file does not exist: {missing_path}", ) return "failed" try: active_uploader = uploader or S3CompatibleUploader(backend, client=self.client_factory(backend)) result = active_uploader.upload_file( local_path=source_path, container_name=task["target_container_name"] or backend["container_name"], locator=task["target_locator"], ) self.repository.record_remote_file( file_asset_id=int(task["file_asset_id"]), backend_id=int(task["target_backend_id"]), container_name=task["target_container_name"] or backend["container_name"], locator=task["target_locator"], public_url=result["public_url"], download_url=result["download_url"], ) self.repository.mark_upload_task_status( task_id=int(task["id"]), status="succeeded", last_error=None, ) return "succeeded" except Exception as exc: self.repository.mark_upload_task_status( task_id=int(task["id"]), status="failed", last_error=f"{type(exc).__name__}: {exc}", ) return "failed"