import tempfile
import unittest
from pathlib import Path


class LocalMaintenanceTests(unittest.TestCase):
    """Tests for ``LocalMaintenanceService`` duplicate scanning and dedupe.

    Each test runs against a scratch database and library directory that
    live under a temporary directory removed during test cleanup.
    """

    def _build_repo(self):
        """Create a scratch database and return ``(root, db_path, repo)``.

        The temporary directory is registered with ``addCleanup`` and is
        created with ``ignore_cleanup_errors=True``.
        """
        from musicdl.catalogsync.db import initialize_database
        from musicdl.catalogsync.repository import CatalogRepository

        scratch = tempfile.TemporaryDirectory(ignore_cleanup_errors=True)
        self.addCleanup(scratch.cleanup)

        root = Path(scratch.name)
        db_path = root / "catalogsync.db"

        # Only the initialisation side effect is wanted here; the object
        # returned by initialize_database is closed straight away.
        handle = initialize_database(db_path)
        handle.close()

        return root, db_path, CatalogRepository(db_path)

    def _seed_duplicate_local_files(self):
        """Register one song with two local files and write both to disk.

        Returns a dict holding the scratch ``root``, ``db_path``, ``repo``,
        the created ``song_id`` / ``backend_id`` / ``asset_id``, the
        ``file_locations`` ids of both records, and both on-disk paths.
        """
        from musicdl.catalogsync.models import CatalogSong

        root, db_path, repo = self._build_repo()

        keep_locator = "Singer A/Duplicate Song.flac"
        extra_locator = "Singer A/Duplicate Song (1).flac"
        content = b"abcdefg"
        content_size = len(content)  # 7 bytes, the size recorded in the catalog

        song = CatalogSong(
            platform="qq",
            remote_song_id="song-dup-1",
            name="Duplicate Song",
            singers="Singer A",
            ext="flac",
            file_size_bytes=content_size,
            quality_label="lossless",
            metadata={},
        )
        song_id = repo.upsert_song(song)

        library_root = root / "library"
        backend_id = repo.ensure_local_backend(
            library_root,
            name="default-local",
            is_default=True,
        )

        # Record the canonical file first, then the "(1)" copy; only the
        # value returned for the first record is kept as the asset id.
        recorded = [
            repo.record_local_file(
                song_id=song_id,
                backend_id=backend_id,
                relative_path=locator,
                file_size_bytes=content_size,
                ext="flac",
                quality_label="lossless",
            )
            for locator in (keep_locator, extra_locator)
        ]
        asset_id = recorded[0]

        artist_dir = library_root / "Singer A"
        canonical_path = artist_dir / "Duplicate Song.flac"
        duplicate_path = artist_dir / "Duplicate Song (1).flac"
        artist_dir.mkdir(parents=True, exist_ok=True)
        canonical_path.write_bytes(content)
        duplicate_path.write_bytes(content)

        # Look both location rows up (canonical first) before reading ids.
        canonical_row, duplicate_row = (
            repo._fetchone(
                "SELECT * FROM file_locations WHERE locator = ?",
                (locator,),
            )
            for locator in (keep_locator, extra_locator)
        )
        canonical_location_id = int(canonical_row["id"])
        duplicate_location_id = int(duplicate_row["id"])

        return {
            "root": root,
            "db_path": db_path,
            "repo": repo,
            "song_id": song_id,
            "backend_id": backend_id,
            "asset_id": asset_id,
            "canonical_location_id": canonical_location_id,
            "duplicate_location_id": duplicate_location_id,
            "canonical_path": canonical_path,
            "duplicate_path": duplicate_path,
        }

    def test_scan_local_duplicates_reports_groups_and_prefers_canonical_locator(self):
        # A scan over the seeded library must report a single group that
        # keeps the plain locator and lists the "(1)" copy as the duplicate.
        from musicdl.catalogsync.ops.maintenance import LocalMaintenanceService

        seeded = self._seed_duplicate_local_files()

        service = LocalMaintenanceService(seeded["db_path"])
        report = service.scan_local_duplicates(sample_limit=10)

        summary = report["summary"]
        self.assertEqual(1, summary["duplicate_group_count"])
        self.assertEqual(1, summary["duplicate_location_count"])
        self.assertEqual(2, summary["scanned_active_local_location_count"])

        groups = report["groups"]
        self.assertEqual(1, len(groups))

        group = groups[0]
        self.assertEqual(seeded["song_id"], group["song_id"])
        self.assertEqual(seeded["backend_id"], group["backend_id"])
        self.assertEqual("Duplicate Song", group["song_name"])

        kept = group["keep"]
        self.assertEqual(seeded["canonical_location_id"], kept["id"])
        self.assertEqual("Singer A/Duplicate Song.flac", kept["locator"])
        self.assertTrue(kept["file_exists"])

        duplicates = group["duplicates"]
        self.assertEqual(1, len(duplicates))

        only_duplicate = duplicates[0]
        self.assertEqual(seeded["duplicate_location_id"], only_duplicate["id"])
        self.assertEqual(
            "Singer A/Duplicate Song (1).flac",
            only_duplicate["locator"],
        )

    def test_dedupe_local_duplicates_repoints_references_and_deletes_duplicate_files(self):
        # An upload task and a job item both point at the duplicate
        # location; after dedupe they must reference the canonical location,
        # the duplicate row must be inactive and its file must be gone.
        from musicdl.catalogsync.ops.models import JobStatus
        from musicdl.catalogsync.ops.repository import OpsRepository
        from musicdl.catalogsync.ops.maintenance import LocalMaintenanceService

        seeded = self._seed_duplicate_local_files()
        db_path = seeded["db_path"]
        canonical_id = seeded["canonical_location_id"]
        duplicate_id = seeded["duplicate_location_id"]

        repo = seeded["repo"]
        ops_repo = OpsRepository(db_path)

        remote_backend_id = repo.upsert_object_storage_backend(
            name="test-bucket",
            container_name="music",
            endpoint="https://s3.example.invalid",
            region=None,
            base_prefix="catalogsync",
            credential_env_prefix="CATALOGSYNC_TEST",
            public_base_url="https://cdn.example.invalid",
        )
        upload_task_id = repo.enqueue_upload_task(
            file_asset_id=seeded["asset_id"],
            source_location_id=duplicate_id,
            target_backend_id=remote_backend_id,
            target_container_name="music",
            target_locator="Singer A/Duplicate Song.flac",
        )

        job_id = ops_repo.create_job(
            job_type="upload_only",
            config_snapshot={},
            status=JobStatus.QUEUED,
        )
        stage_id = ops_repo.create_stage(
            job_run_id=job_id,
            stage_type="upload",
            seq_no=1,
        )
        item_id = ops_repo.create_item(
            job_stage_id=stage_id,
            item_type="song_upload",
            item_key="upload:dup-song",
            song_id=seeded["song_id"],
            file_location_id=duplicate_id,
        )

        service = LocalMaintenanceService(db_path)
        report = service.dedupe_local_duplicates(sample_limit=10)

        # The summary in the returned payload reports no duplicates left.
        summary = report["summary"]
        self.assertEqual(0, summary["duplicate_group_count"])
        self.assertEqual(0, summary["duplicate_location_count"])

        execution = report["execution"]
        self.assertEqual(1, execution["deduped_group_count"])
        self.assertEqual(1, execution["inactive_location_count"])
        self.assertEqual(1, execution["deleted_file_count"])
        self.assertEqual(7, execution["released_bytes"])
        self.assertEqual(1, execution["repointed_upload_task_count"])
        self.assertEqual(1, execution["repointed_job_item_count"])

        duplicate_state = repo._fetchone(
            "SELECT status, is_primary FROM file_locations WHERE id = ?",
            (duplicate_id,),
        )
        self.assertEqual("inactive", duplicate_state["status"])
        self.assertEqual(0, int(duplicate_state["is_primary"]))

        canonical_state = repo._fetchone(
            "SELECT status, is_primary FROM file_locations WHERE id = ?",
            (canonical_id,),
        )
        self.assertEqual("active", canonical_state["status"])
        self.assertEqual(1, int(canonical_state["is_primary"]))

        task_row = repo._fetchone(
            "SELECT source_location_id FROM upload_tasks WHERE id = ?",
            (upload_task_id,),
        )
        self.assertEqual(canonical_id, int(task_row["source_location_id"]))

        item_row = ops_repo._fetchone(
            "SELECT file_location_id FROM job_items WHERE id = ?",
            (item_id,),
        )
        self.assertEqual(canonical_id, int(item_row["file_location_id"]))

        presence = repo.get_song_backend_presence(
            song_id=seeded["song_id"],
            backend_id=seeded["backend_id"],
        )
        self.assertIsNotNone(presence)
        self.assertEqual(1, int(presence["active_file_count"]))
        self.assertEqual(canonical_id, int(presence["primary_file_location_id"]))

        self.assertTrue(seeded["canonical_path"].exists())
        self.assertFalse(seeded["duplicate_path"].exists())

    def test_dedupe_local_duplicates_raises_when_jobs_or_items_are_running(self):
        # With a job in RUNNING status present, dedupe must raise
        # LocalDedupeBlockedError.
        from musicdl.catalogsync.ops.models import JobStatus
        from musicdl.catalogsync.ops.maintenance import (
            LocalDedupeBlockedError,
            LocalMaintenanceService,
        )
        from musicdl.catalogsync.ops.repository import OpsRepository

        seeded = self._seed_duplicate_local_files()
        db_path = seeded["db_path"]

        OpsRepository(db_path).create_job(
            job_type="download_only",
            config_snapshot={},
            status=JobStatus.RUNNING,
        )

        # Construct the service outside the assertRaises block so that only
        # the dedupe call itself is expected to raise.
        service = LocalMaintenanceService(db_path)
        with self.assertRaises(LocalDedupeBlockedError):
            service.dedupe_local_duplicates()