Files
musicdl-catalog-sync-suite/catalog-sync/musicdl/modules/utils/lyric.py
T

142 lines
8.9 KiB
Python

'''
Function:
Implementation of Lyric Related Utils
Author:
Zhenchao Jin
WeChat Official Account (微信公众号):
Charles的皮卡丘
'''
from __future__ import annotations
import os
import re
import copy
import random
import tempfile
import requests
from typing import Optional
from .misc import resp2json
from urllib.parse import quote
from .importutils import optionalimportfrom
'''cleanlrc'''
def cleanlrc(text):
    '''
    Normalize raw lyric text into newline-joined, non-empty lines.

    The input is converted with ``str()``, ``\\r\\n`` and lone ``\\r`` line
    endings become ``\\n``, and every line is stripped of BOM / zero-width /
    non-breaking-space characters and surrounding whitespace. Lines that end
    up empty, or that consist solely of a single ``[mm:ss]``,
    ``[mm:ss.xxx]`` or ``[hh:mm:ss.xxx]`` tag with two-digit fields, are
    dropped.

    Args:
        text: Lyric text (any object; it is passed through ``str()``).

    Returns:
        str: The cleaned lines joined with ``\\n``.
    '''
    invisible_chars = "\ufeff\u200b\u200c\u200d\u2060\u00a0 \t"
    bare_timestamp = re.compile(r"\[(\d{2}:)?\d{2}:\d{2}(?:\.\d{1,3})?\]")
    kept_lines = []
    for raw_line in re.sub(r"\r\n?", "\n", str(text)).split("\n"):
        # Strip invisible characters first, then ordinary whitespace.
        stripped = raw_line.strip(invisible_chars).strip()
        if not stripped:
            continue
        # A line holding only a time tag carries no lyric text.
        if bare_timestamp.fullmatch(stripped):
            continue
        kept_lines.append(stripped)
    return "\n".join(kept_lines)
'''fractoseconds'''
def fractoseconds(frac: str | None) -> float:
    '''
    Convert the digits that follow a decimal point into a fraction of a second.

    The number of digits sets the scale, e.g. ``"5"`` -> 0.5, ``"25"`` -> 0.25
    and ``"125"`` -> 0.125.

    Args:
        frac: Digit string after the decimal point; ``None`` or ``""`` means
            there is no fractional part.

    Returns:
        float: The fractional seconds, ``0.0`` when ``frac`` is empty.
    '''
    if frac:
        # One digit means tenths, two mean hundredths, three mean thousandths.
        return int(frac) / (10 ** len(frac))
    return 0.0
'''extractdurationsecondsfromlrc'''
def extractdurationsecondsfromlrc(lrc: str) -> Optional[float]:
    '''
    Return the largest timestamp found in LRC text, in seconds.

    Recognised tags look like ``[mm:ss]``, ``[mm:ss.xxx]`` or
    ``[hh:mm:ss.xxx]``. The latest tag is used as an estimate of the track
    duration.

    Args:
        lrc: LRC-formatted lyric text; an empty value or the literal
            ``'NULL'`` is treated as "no lyrics".

    Returns:
        Optional[float]: The greatest timestamp in seconds, or ``None`` when
        there are no lyrics or no timestamp tag is present.
    '''
    if (not lrc) or lrc == 'NULL':
        return None
    stamp_re = re.compile(r"\[(?:(\d{1,2}):)?(\d{1,2}):(\d{2})(?:\.(\d{1,3}))?\]")

    def tag_seconds(match):
        # Groups that did not take part in the match come back as ''.
        hours, minutes, seconds, fraction = match.groups(default='')
        whole = (int(hours) if hours else 0) * 3600 + int(minutes) * 60 + int(seconds)
        return whole + fractoseconds(fraction)

    return max((tag_seconds(m) for m in stamp_re.finditer(lrc)), default=None)
'''WhisperLRC'''
class WhisperLRC:
    '''
    Produce LRC-style lyrics by transcribing audio with a ``faster_whisper`` model.

    ``whisper_model`` is ``None`` when ``optionalimportfrom`` does not return a
    usable ``WhisperModel`` (presumably because ``faster_whisper`` is not
    installed); in that case the transcription methods raise ``AssertionError``.
    '''
    def __init__(self, model_size_or_path="small", device="auto", compute_type="int8", cpu_threads=4, num_workers=1, **kwargs):
        '''
        Build the underlying ``WhisperModel`` if it can be imported.

        All arguments, including ``**kwargs``, are forwarded unchanged to the
        ``WhisperModel`` constructor.
        '''
        WhisperModel = optionalimportfrom('faster_whisper', 'WhisperModel')
        if WhisperModel:
            self.whisper_model = WhisperModel(
                model_size_or_path, device=device, compute_type=compute_type,
                cpu_threads=cpu_threads, num_workers=num_workers, **kwargs,
            )
        else:
            self.whisper_model = None
    '''downloadtotmpdir'''
    @staticmethod
    def downloadtotmpdir(url: str, headers: dict = None, timeout: int = 300, cookies: dict = None, request_overrides: dict = None):
        '''
        Stream ``url`` into a new temporary file and return the file's path.

        Args:
            url: Address of the audio file.
            headers: Request headers, used unless ``request_overrides`` has 'headers'.
            timeout: Request timeout, used unless ``request_overrides`` has 'timeout'.
            cookies: Request cookies, used unless ``request_overrides`` has 'cookies'.
            request_overrides: Extra keyword arguments for ``requests.get``;
                a deep copy is taken so the caller's dict is never modified.

        Returns:
            str: Path of the temporary file. The caller must delete it.

        Raises:
            requests.HTTPError: If the response status signals an error.
        '''
        request_overrides = copy.deepcopy(request_overrides or {})
        request_overrides.setdefault('headers', headers or {})
        request_overrides.setdefault('timeout', timeout)
        request_overrides.setdefault('cookies', cookies or {})
        resp = requests.get(url, stream=True, **request_overrides)
        try:
            resp.raise_for_status()
            # Keep the URL's extension (if recognisable) as the file suffix.
            m = re.search(r"\.([a-z0-9]{2,5})(?:\?|$)", url, re.I)
            fd, path = tempfile.mkstemp(suffix="." + (m.group(1).lower() if m else "bin"))
            try:
                with os.fdopen(fd, "wb") as fp:
                    for ch in resp.iter_content(32768):
                        if ch:
                            fp.write(ch)
            except BaseException:
                # The path is never handed to the caller on failure, so the
                # partially written file has to be removed here.
                if os.path.exists(path):
                    os.remove(path)
                raise
            return path
        finally:
            # A streamed response holds its connection until closed.
            resp.close()
    '''timestamp'''
    @staticmethod
    def timestamp(t):
        '''
        Format a time in seconds as an LRC tag ``[mm:ss.xx]``.

        Negative values are clamped to zero. When the seconds part rounds up
        to ``60.00`` the overflow is carried into the minutes, so a tag such
        as ``[00:60.00]`` is never produced.

        Args:
            t: Time in seconds (anything accepted by ``float()``).

        Returns:
            str: The formatted tag, e.g. ``[01:01.50]`` for ``61.5``.
        '''
        t = max(0.0, float(t))
        mm = int(t // 60)
        ss_text = f"{t - mm * 60:05.2f}"
        if ss_text == "60.00":
            # Two-decimal rounding pushed the seconds to a full minute.
            mm, ss_text = mm + 1, "00.00"
        return f"[{mm:02d}:{ss_text}]"
    '''_ensuremodel'''
    def _ensuremodel(self):
        '''Raise ``AssertionError`` if no whisper model is available (survives ``python -O``).'''
        if self.whisper_model is None:
            raise AssertionError('faster_whisper should be installed via "pip install "faster_whisper"')
    '''_transcribetolrc'''
    def _transcribetolrc(self, audio_path: str, transcribe_overrides: dict = None) -> dict:
        '''Transcribe ``audio_path`` and return the language / prob / duration / lyric dict.'''
        settings = {'language': None, 'vad_filter': True, 'vad_parameters': dict(min_silence_duration_ms=300), 'chunk_length': 30, 'beam_size': 5}
        settings.update(transcribe_overrides or {})
        segs, info = self.whisper_model.transcribe(audio_path, **settings)
        # NOTE(review): faster_whisper is understood to yield segments lazily,
        # so the audio file must still exist while this join runs.
        lrc = "\n".join(f"{self.timestamp(s.start)}{s.text.strip()}" for s in segs)
        return {"language": info.language, "prob": info.language_probability, "duration": getattr(info, "duration", None), 'lyric': lrc}
    '''fromurl'''
    def fromurl(self, url: str, transcribe_overrides: dict = None, headers: dict = None, timeout: int = 300, cookies: dict = None, request_overrides: dict = None):
        '''
        Download the audio at ``url`` and transcribe it into LRC text.

        Args:
            url: Address of the audio file.
            transcribe_overrides: Keyword arguments that override the default
                ``transcribe`` settings.
            headers, timeout, cookies, request_overrides: Passed to
                ``downloadtotmpdir``.

        Returns:
            dict: ``{'language', 'prob', 'duration', 'lyric'}``.

        Raises:
            AssertionError: If no whisper model is available.
        '''
        self._ensuremodel()
        tmp_file_path = ''
        try:
            tmp_file_path = self.downloadtotmpdir(url, headers=headers, timeout=timeout, cookies=cookies, request_overrides=request_overrides)
            return self._transcribetolrc(tmp_file_path, transcribe_overrides)
        finally:
            # Always remove the downloaded file, even if transcription failed.
            if tmp_file_path and os.path.exists(tmp_file_path):
                os.remove(tmp_file_path)
    '''fromfilepath'''
    def fromfilepath(self, file_path: str, transcribe_overrides: dict = None):
        '''
        Transcribe a local audio file into LRC text.

        Args:
            file_path: Path of the audio file.
            transcribe_overrides: Keyword arguments that override the default
                ``transcribe`` settings.

        Returns:
            dict: ``{'language', 'prob', 'duration', 'lyric'}``.

        Raises:
            AssertionError: If no whisper model is available.
        '''
        self._ensuremodel()
        return self._transcribetolrc(file_path, transcribe_overrides)
'''LyricSearchClient'''
class LyricSearchClient():
    '''
    Look up lyrics for a track through several public lyric web APIs.

    Every ``searchby*`` backend returns ``(lyric_result, lyric)`` where
    ``lyric_result`` is the decoded API payload and ``lyric`` is the cleaned
    lyric text, or the literal ``'NULL'`` when the payload holds no lyrics.
    '''
    # Browser-like user agent sent by the backends that do not need a token.
    _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/145.0.0.0 Safari/537.36"
    '''_requestkwargs'''
    @staticmethod
    def _requestkwargs(request_overrides: dict = None, headers: dict = None, timeout: int = 10) -> dict:
        '''
        Merge default ``headers`` / ``timeout`` with caller overrides.

        Values in ``request_overrides`` win, so supplying 'headers' or
        'timeout' there no longer collides with the defaults as a duplicate
        keyword argument. The caller's dict is not modified.
        '''
        kwargs = dict(request_overrides or {})
        kwargs.setdefault('headers', headers or {})
        kwargs.setdefault('timeout', timeout)
        return kwargs
    '''search'''
    @staticmethod
    def search(track_name: str, artist_name: str, allowed_lyric_apis: tuple = ('searchbylrclibapig', 'searchbylrclibapis'), request_overrides: dict = None):
        '''
        Try each lyric API in order and return the first usable result.

        Args:
            track_name: Title of the track.
            artist_name: Name of the artist.
            allowed_lyric_apis: Backends to try; each entry is either a
                callable or the name of a method on this class. Entries that
                resolve to nothing are skipped.
            request_overrides: Extra keyword arguments for ``requests.get``.

        Returns:
            tuple: ``(lyric_result, lyric)`` from the first backend whose
            lyric is not empty / 'NULL' / 'None'; otherwise the outcome of
            the last attempt (``({}, 'NULL')`` if it failed or none ran).
        '''
        lyric_result, lyric = {}, 'NULL'
        for lyric_api in allowed_lyric_apis:
            if not callable(lyric_api):
                # Only strings can name a method; anything else is unusable.
                lyric_api = getattr(LyricSearchClient, lyric_api, None) if isinstance(lyric_api, str) else None
            if lyric_api is None:
                lyric_result, lyric = {}, 'NULL'
                continue
            try:
                lyric_result, lyric = lyric_api(track_name=track_name, artist_name=artist_name, request_overrides=request_overrides)
            except Exception:
                # Best effort: a failing backend must not stop the remaining ones.
                lyric_result, lyric = {}, 'NULL'
            if lyric and (lyric not in {'NULL', 'None'}):
                return lyric_result, lyric
        return lyric_result, lyric
    '''searchbylrclibapig'''
    @staticmethod
    def searchbylrclibapig(track_name: str, artist_name: str, request_overrides: dict = None):
        '''
        Query ``https://lrclib.net/api/get`` by artist and track name.

        Prefers the payload's 'syncedLyrics' and falls back to 'plainLyrics'.

        Raises:
            requests.HTTPError: If the response status signals an error.
        '''
        headers = {"user-agent": LyricSearchClient._USER_AGENT}
        resp = requests.get("https://lrclib.net/api/get", params={"artist_name": artist_name, "track_name": track_name}, **LyricSearchClient._requestkwargs(request_overrides, headers))
        resp.raise_for_status()
        lyric_result = resp2json(resp=resp)
        lyric = cleanlrc(lyric_result.get('syncedLyrics') or lyric_result.get('plainLyrics') or 'NULL')
        return lyric_result, lyric
    '''searchbylrclibapis'''
    @staticmethod
    def searchbylrclibapis(track_name: str, artist_name: str, request_overrides: dict = None):
        '''
        Query ``https://lrclib.net/api/search`` with "<artist> <track>".

        The lyric is taken from the first entry of the returned payload
        ('syncedLyrics', else 'plainLyrics'); the whole payload is returned
        as ``lyric_result``.

        Raises:
            requests.HTTPError: If the response status signals an error.
            IndexError: If the payload is an empty list (no hits).
        '''
        headers = {"user-agent": LyricSearchClient._USER_AGENT}
        resp = requests.get("https://lrclib.net/api/search", params={"q": f"{artist_name} {track_name}"}, **LyricSearchClient._requestkwargs(request_overrides, headers))
        resp.raise_for_status()
        lyric_result = resp2json(resp=resp)
        first_hit = lyric_result[0]
        lyric = cleanlrc(first_hit.get('syncedLyrics') or first_hit.get('plainLyrics') or 'NULL')
        return lyric_result, lyric
    '''searchbylyricsovhapiv1'''
    @staticmethod
    def searchbylyricsovhapiv1(track_name: str, artist_name: str, request_overrides: dict = None):
        '''
        Query ``https://api.lyrics.ovh/v1/<artist>/<track>``.

        Both path segments are percent-encoded. The HTTP status is not
        checked; a payload without 'lyrics' yields ``'NULL'``.
        '''
        headers = {"user-agent": LyricSearchClient._USER_AGENT}
        url = f"https://api.lyrics.ovh/v1/{quote(artist_name, safe='')}/{quote(track_name, safe='')}"
        resp = requests.get(url, **LyricSearchClient._requestkwargs(request_overrides, headers))
        lyric_result = resp2json(resp=resp)
        lyric = cleanlrc(lyric_result.get('lyrics') or 'NULL')
        return lyric_result, lyric
    '''searchbyhappiapiv1'''
    @staticmethod
    def searchbyhappiapiv1(track_name: str, artist_name: str, request_overrides: dict = None):
        '''
        Query ``https://api.happi.dev/v1/lyrics`` by artist and track name.

        The lyric is read from ``payload['result'][0]['lyrics']``; a payload
        of a different shape raises ``KeyError`` / ``IndexError``.
        '''
        # NOTE(review): API token committed in source; consider loading it from configuration.
        headers = {'accept': 'application/json', 'x-happi-token': 'hk254-C1VegxwlJjYdYFPtdUDpg8qiVpmAXVl0aA'}
        resp = requests.get('https://api.happi.dev/v1/lyrics', params={'artist': artist_name, 'track': track_name}, **LyricSearchClient._requestkwargs(request_overrides, headers))
        lyric_result = resp2json(resp=resp)
        lyric = cleanlrc(lyric_result['result'][0]['lyrics'] or 'NULL')
        return lyric_result, lyric
    '''searchbymusixmatchapi'''
    @staticmethod
    def searchbymusixmatchapi(track_name: str, artist_name: str, request_overrides: dict = None):
        '''
        Query the Musixmatch ``matcher.lyrics.get`` endpoint.

        The query is passed through ``params`` so that names containing
        characters such as ``&``, ``#`` or ``=`` are encoded instead of
        corrupting the query string. The lyric is read from
        ``payload['message']['body']['lyrics']['lyrics_body']``; a payload of
        a different shape raises ``KeyError`` / ``TypeError``.
        '''
        # NOTE(review): API key committed in source; consider loading it from configuration.
        candidate_req_keys = ['3bc1042fde1ac8c1979c400d6f921320']
        headers = {"user-agent": LyricSearchClient._USER_AGENT}
        params = {'apikey': random.choice(candidate_req_keys), 'q_track': track_name, 'q_artist': artist_name}
        resp = requests.get("https://api.musixmatch.com/ws/1.1/matcher.lyrics.get", params=params, **LyricSearchClient._requestkwargs(request_overrides, headers))
        lyric_result = resp2json(resp=resp)
        lyric = cleanlrc(lyric_result['message']['body']['lyrics']['lyrics_body'] or 'NULL')
        return lyric_result, lyric