Files
musicdl-catalog-sync-suite/catalog-sync/musicdl/modules/thirdpartysites/kkws.py
T

122 lines
9.6 KiB
Python

'''
Function:
Implementation of KKWSMusicClient: https://www.kkws.cc/
Author:
Zhenchao Jin
WeChat Official Account (微信公众号):
Charles的皮卡丘
'''
import re
import functools
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from rich.progress import Progress
from ..sources import BaseMusicClient
from ..utils import legalizestring, usesearchheaderscookies, seconds2hms, searchdictbykey, safeextractfromdict, extractdurationsecondsfromlrc, resp2json, cleanlrc, SongInfo, QuarkParser, AudioLinkTester
'''KKWSMusicClient'''
class KKWSMusicClient(BaseMusicClient):
source = 'KKWSMusicClient'
MUSIC_QUALITY_RANK = {"DSD": 100, "DSF": 100, "DFF": 100, "WAV": 95, "AIFF": 95, "FLAC": 90, "ALAC": 90, "APE": 88, "WV": 88, "OPUS": 70, "AAC": 65, "M4A": 65, "OGG": 60, "VORBIS": 60, "MP3": 50, "WMA": 45}
def __init__(self, **kwargs):
super(KKWSMusicClient, self).__init__(**kwargs)
assert self.quark_parser_config.get('cookies'), f'{self.source}.__init__ >>> "quark_parser_config" is not configured, so the songs cannot be downloaded.'
self.default_search_headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"}
self.default_download_headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"}
self.default_headers = self.default_search_headers
self._initsession()
'''_constructsearchurls'''
def _constructsearchurls(self, keyword: str, rule: dict = None, request_overrides: dict = None):
# init
rule, request_overrides = rule or {}, request_overrides or {}
# construct search urls
self.search_size_per_page = min(self.search_size_per_source, 15)
search_urls, page_size, count = [], self.search_size_per_page, 0
while self.search_size_per_source > count:
search_urls.append(f'https://www.kkws.cc/search.html?key={keyword}&page={int(count // page_size) + 1}')
count += page_size
# return
return search_urls
'''_parsesearchresultsfromhtml'''
def _parsesearchresultsfromhtml(self, html_text: str):
soup = BeautifulSoup(html_text, "lxml")
search_results, base_url = [], 'https://www.kkws.cc/'
for li in soup.select("ul.listbox > li"):
if not (a := li.select_one("h2 a[href]")): continue
title_attr = (a.get("title") or "").strip(); full_text = a.get_text(" ", strip=True)
name = title_attr.replace("免费下载", "").strip() if title_attr else full_text; name = re.sub(r"\s*\[[^\]]+\]\s*", " ", name).strip(); name = re.sub(r"\s*-\s*\d+(\.\d+)?[KMG]?\s*$", "", name).strip()
m_fmt = re.search(r"\[([^\]]+)\]", full_text); file_format = m_fmt.group(1).strip() if m_fmt else ""
m_size = re.search(r"-\s*([0-9.]+\s*[KMG]?)", full_text, re.IGNORECASE); size = (m_size.group(1).replace(" ", "") if m_size else "").strip()
ems = li.select("small em"); share_time, singer = "", ""
if len(ems) >= 1: share_time = ems[0].get_text(strip=True).replace("分享时间:", "").strip()
if len(ems) >= 2: singer = ems[-1].get_text(strip=True).replace("演唱:", "").strip()
m_id = re.search(r"/detail/(\d+)\.html", (href := urljoin(base_url, a["href"]))); item_id = m_id.group(1) if m_id else ""
search_results.append({"id": item_id, "name": name, "format": file_format, "size": size, "share_time": share_time, "singer": singer, "detail_url": href})
return search_results
'''_extractlyricsandquark'''
def _extractlyricsandquark(self, html_text: str, song_id: str, request_overrides: dict = None):
request_overrides = request_overrides or {}
tb = (soup := BeautifulSoup(html_text, "lxml")).select_one("#textbox")
to_mmss_func = lambda t: (lambda s: f"{s//60:02d}:{s%60:02d}")(int(float(t.split(":",1)[0])*60+float(t.split(":",1)[1])) if ":" in t else int(float(t)))
lyrics = "" if not tb else "\n".join((f"[{to_mmss_func(m.group(1))}] {m.group(2).strip()}" if (m:=re.match(r"^\[(\d+(?:\.\d+)?|\d{1,2}:\d{2}(?:\.\d+)?)\]\s*(.*)$", line)) else f"{line}") for line in (l.strip() for l in tb.get_text("\n").splitlines()) if line)
url_map, rank = {}, KKWSMusicClient.MUSIC_QUALITY_RANK
for a in soup.select("div.downbox a[onclick]"):
if not (onclick := (a.get("onclick") or "").strip()): continue
args = re.findall(r"'([^']*)'", onclick); name = fmt = url = None
if (parsed := ((args[1], args[2], args[3] or None) if onclick.startswith("openModel") and len(args) >= 4 else (args[1], args[2], None) if onclick.startswith("mbgotourl") and len(args) >= 3 else None)) is None: continue
name_fmt, url, fmt = parsed; name, fmt = ((lambda n, f2: (n, fmt or f2))(*map(str.strip, name_fmt.split("|", 1))) if "|" in name_fmt else (name_fmt.strip(), fmt))
try: url = resp2json(self.get(f'https://www.kkws.cc/getdown?url={url}&j=1&id={song_id}', allow_redirects=True, **request_overrides))['data']['decrypted_url']
except Exception: url = url
if not (url and "pan.quark.cn" in url): continue
e = url_map.setdefault(url, {"url": url, "formats": set(), "names": set()})
if fmt: e["formats"].add(functools.reduce(lambda f, k: k if k.lower() in f.lower() else f, rank, fmt))
if name: e["names"].add(name)
quark_links = sorted(({"url": e["url"], "formats": sorted(e["formats"]), "names": sorted(e["names"])} for e in url_map.values()), key=lambda x: rank.get(x["formats"][0] if x["formats"] else "UNKNOWN", 0), reverse=True)
quark_links = [q for q in quark_links if isinstance(q, dict) and q.get('url')]
return {"lyrics": lyrics, "quark_links": quark_links}
'''_search'''
@usesearchheaderscookies
def _search(self, keyword: str = '', search_url: str = '', request_overrides: dict = None, song_infos: list = [], progress: Progress = None, progress_id: int = 0):
# init
request_overrides = request_overrides or {}
# successful
try:
# --search results
(resp := self.get(search_url, **request_overrides)).raise_for_status()
search_results = self._parsesearchresultsfromhtml(resp.text)
for search_result in search_results:
# --download results
if not isinstance(search_result, dict) or ('detail_url' not in search_result) or ('id' not in search_result): continue
song_info = SongInfo(source=self.source)
try: (resp := self.get(search_result['detail_url'], **request_overrides)).raise_for_status(); download_result = self._extractlyricsandquark(resp.text, search_result['id'], request_overrides)
except Exception: continue
for quark_info in download_result['quark_links']:
download_result['quark_parse_result'], download_url = QuarkParser.parsefromurl(quark_info['url'], **self.quark_parser_config)
if not download_url or not str(download_url).startswith('http'): continue
duration = [int(float(d)) for d in searchdictbykey(download_result, 'duration') if int(float(d)) > 0]
duration_in_secs = duration[0] if duration else 0
song_info = SongInfo(
raw_data={'search': search_result, 'download': download_result, 'lyric': {}}, source=self.source, song_name=legalizestring(search_result.get('name')), singers=legalizestring(search_result.get('singer')), album='NULL',
ext='mp3', file_size_bytes=None, file_size=None, identifier=search_result['id'], duration_s=duration_in_secs, duration=seconds2hms(duration_in_secs), lyric=cleanlrc(safeextractfromdict(download_result, ['lyrics'], '')),
cover_url=None, download_url=download_url, download_url_status=self.quark_audio_link_tester.test(download_url, request_overrides), default_download_headers=self.quark_default_download_headers,
)
song_info.download_url_status['probe_status'] = self.quark_audio_link_tester.probe(song_info.download_url, request_overrides)
song_info.file_size = song_info.download_url_status['probe_status']['file_size']; song_info.ext = song_info.download_url_status['probe_status']['ext']
if (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS) and (song_info.download_url_status['probe_status']['ext'] in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = song_info.download_url_status['probe_status']['ext']
elif (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = 'mp3'
if song_info.with_valid_download_url: break
if not song_info.lyric or '歌词获取失败' in song_info.lyric: song_info.lyric = 'NULL'
if not song_info.duration or song_info.duration == '-:-:-': song_info.duration = seconds2hms(extractdurationsecondsfromlrc(song_info.lyric))
if not song_info.with_valid_download_url: continue
# --append to song_infos
song_infos.append(song_info)
# --judgement for search_size
if self.strict_limit_search_size_per_page and len(song_infos) >= self.search_size_per_page: break
# --update progress
progress.update(progress_id, description=f"{self.source}.search >>> {search_url} (Success)")
# failure
except Exception as err:
progress.update(progress_id, description=f"{self.source}.search >>> {search_url} (Error: {err})")
# return
return song_infos