'''
Function:
    Implementation of BaseMusicClient
Author:
    Zhenchao Jin
WeChat Official Account:
    Charles的皮卡丘
'''
from __future__ import annotations
import os
import re
import copy
import random
import pickle
import requests
from pathlib import Path
from threading import Lock
from rich.text import Text
from itertools import chain
from datetime import datetime
from collections import defaultdict
from pathvalidate import sanitize_filepath
from concurrent.futures import ThreadPoolExecutor, as_completed
from rich.progress import (
    Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn, DownloadColumn,
    TransferSpeedColumn, TimeRemainingColumn, MofNCompleteColumn, ProgressColumn, Task,
)
from ..utils import (
    LoggerHandle, AudioLinkTester, SongInfo, SongInfoUtils, HLSDownloader, touchdir,
    usedownloadheaderscookies, usesearchheaderscookies, useparseheaderscookies, cookies2dict,
    cookies2string, shortenpathsinsonginfos, optionalimport, optionalimportfrom,
)


DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"


def build_user_agent() -> str:
    '''Return a random User-Agent from fake_useragent when usable, else DEFAULT_USER_AGENT.'''
    try:
        user_agent_cls = optionalimportfrom("fake_useragent", "UserAgent")
        if user_agent_cls is not None:
            return user_agent_cls().random
    except Exception:
        # Deliberate best-effort: any failure of the optional dependency falls back to the static UA.
        pass
    return DEFAULT_USER_AGENT


'''AudioAwareColumn'''
class AudioAwareColumn(ProgressColumn):
    '''Progress column that renders item counts for "overall"/"hls" tasks and byte counts otherwise.'''
    # task "kind" field -> unit shown after the completed/total counter
    _COUNT_UNITS = {"overall": "audios", "hls": "segments"}

    def __init__(self):
        super().__init__()
        self._download_col = DownloadColumn()

    '''render'''
    def render(self, task: Task):
        '''Render `task` according to its "kind" field (defaults to "download").'''
        unit = self._COUNT_UNITS.get(task.fields.get("kind", "download"))
        if unit is None:
            return self._download_col.render(task)
        completed = int(task.completed)
        total = int(task.total) if task.total is not None else 0
        return Text(f"{completed}/{total} {unit}")


'''BaseMusicClient'''
class BaseMusicClient():
    '''Base class of the music clients: holds session/header/cookie state and the generic
    multi-threaded search and download drivers. Subclasses provide `_constructsearchurls`
    and `_search`.'''
    source = 'BaseMusicClient'

    def __init__(self, search_size_per_source: int = 5, auto_set_proxies: bool = False, random_update_ua: bool = False, enable_search_curl_cffi: bool = False, enable_parse_curl_cffi: bool = False,
                 enable_download_curl_cffi: bool = False, maintain_session: bool = False, logger_handle: LoggerHandle = None, disable_print: bool = False, work_dir: str = 'musicdl_outputs', max_retries: int = 3,
                 freeproxy_settings: dict = None, default_search_cookies: dict | str = None, default_download_cookies: dict | str = None, default_parse_cookies: dict | str = None,
                 strict_limit_search_size_per_page: bool = True, search_size_per_page: int = 10, quark_parser_config: dict = None):
        '''Store the configuration, create `work_dir`, build the default headers/cookies and the session.'''
        # set up work dir
        touchdir(work_dir)
        # set attributes
        self.search_size_per_source = search_size_per_source
        self.auto_set_proxies = auto_set_proxies
        self.random_update_ua = random_update_ua
        self.max_retries = max_retries
        self.maintain_session = maintain_session
        self.logger_handle = logger_handle if logger_handle else LoggerHandle()
        self.disable_print = disable_print
        self.work_dir = work_dir
        self.freeproxy_settings = freeproxy_settings or {}
        self.quark_parser_config = quark_parser_config or {}
        self.default_search_cookies = cookies2dict(default_search_cookies)
        self.default_download_cookies = cookies2dict(default_download_cookies)
        self.default_parse_cookies = cookies2dict(default_parse_cookies)
        self.default_cookies = self.default_search_cookies
        self.search_size_per_page = min(search_size_per_source, search_size_per_page)
        self.strict_limit_search_size_per_page = strict_limit_search_size_per_page
        self.enable_search_curl_cffi = enable_search_curl_cffi
        self.enable_download_curl_cffi = enable_download_curl_cffi
        self.enable_parse_curl_cffi = enable_parse_curl_cffi
        self.enable_curl_cffi = self.enable_search_curl_cffi
        # Any of the three curl_cffi switches can end up driving get()/post(), so the
        # impersonation targets are needed whenever at least one of them is on.
        any_curl_cffi = enable_search_curl_cffi or enable_download_curl_cffi or enable_parse_curl_cffi
        self.cc_impersonates = self._listccimpersonates() if any_curl_cffi else None
        # init requests.Session
        self.default_search_headers = {'User-Agent': build_user_agent()}
        self.default_download_headers = {'User-Agent': build_user_agent()}
        self.default_parse_headers = {'User-Agent': build_user_agent()}
        self.quark_default_download_headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36 Core/1.94.225.400 QQBrowser/12.2.5544.400',
            'origin': 'https://pan.quark.cn', 'referer': 'https://pan.quark.cn/', 'accept-language': 'zh-CN,zh;q=0.9', 'cookie': cookies2string(self.quark_parser_config.get('cookies', '')),
        }
        self.quark_default_download_cookies = {} # placeholder, useless now
        self.default_headers = self.default_search_headers
        self._initsession()
        # proxied_session_client
        freeproxy = optionalimportfrom('freeproxy', 'freeproxy')
        default_freeproxy_settings = dict(disable_print=True, proxy_sources=['ProxiflyProxiedSession'], max_tries=20, init_proxied_session_cfg={})
        default_freeproxy_settings.update(self.freeproxy_settings)
        self.proxied_session_client = freeproxy.ProxiedSessionClient(**default_freeproxy_settings) if auto_set_proxies else None

    '''_listccimpersonates'''
    def _listccimpersonates(self):
        '''Scan the installed curl_cffi package files for browser impersonation names and return them sorted.'''
        curl_cffi = optionalimport('curl_cffi')
        root = Path(curl_cffi.__file__).resolve().parent
        exts = {".py", ".so", ".pyd", ".dll", ".dylib"}
        pat = re.compile(rb"\b(?:chrome|edge|safari|firefox|tor)(?:\d+[a-z_]*|_android|_ios)?\b")
        names = set()
        for path in root.rglob("*"):
            if path.suffix not in exts:
                continue
            names.update(match.decode("utf-8", "ignore") for match in pat.findall(path.read_bytes()))
        return sorted(names)

    '''_initsession'''
    def _initsession(self):
        '''(Re)create the HTTP session and the audio link testers.

        Does nothing when `maintain_session` is set and all three objects already exist.
        '''
        if self.maintain_session and getattr(self, 'session', None) and getattr(self, 'audio_link_tester', None) and getattr(self, 'quark_audio_link_tester', None):
            return
        curl_cffi = optionalimport('curl_cffi')
        self.session = requests.Session() if not self.enable_curl_cffi else curl_cffi.requests.Session()
        self.session.headers = self.default_headers
        self.audio_link_tester = AudioLinkTester(headers=copy.deepcopy(self.default_download_headers), cookies=copy.deepcopy(self.default_download_cookies))
        self.quark_audio_link_tester = AudioLinkTester(headers=copy.deepcopy(self.quark_default_download_headers), cookies=copy.deepcopy(self.quark_default_download_cookies))

    '''_constructsearchurls'''
    def _constructsearchurls(self, keyword: str, rule: dict = None, request_overrides: dict = None):
        '''Build the list of search URLs for `keyword`; must be provided by subclasses.'''
        raise NotImplementedError('not to be implemented')

    '''_constructuniqueworkdir'''
    def _constructuniqueworkdir(self, keyword: str, sort_by_search_kwd_and_time: bool = True):
        '''Create and return the output directory for a search.

        The directory is `<work_dir>/<source>/<timestamp> <keyword>` when
        `sort_by_search_kwd_and_time` is true, otherwise `<work_dir>/<source>`.
        '''
        if sort_by_search_kwd_and_time:
            time_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
            raw_dir = os.path.join(self.work_dir, self.source, f'{time_stamp} {keyword}')
        else:
            raw_dir = os.path.join(self.work_dir, self.source)
        work_dir = sanitize_filepath(raw_dir)
        touchdir(work_dir)
        return work_dir

    '''_removeduplicates'''
    def _removeduplicates(self, song_infos: list[SongInfo] = None) -> list[SongInfo]:
        '''Return `song_infos` without repeated `identifier`s, keeping first occurrences in order.

        `None` (the default) is treated as an empty list.
        '''
        unique_song_infos, identifiers = [], set()
        for song_info in song_infos or []:
            if song_info.identifier in identifiers:
                continue
            identifiers.add(song_info.identifier)
            unique_song_infos.append(song_info)
        return unique_song_infos

    '''_shortsongname'''
    @staticmethod
    def _shortsongname(song_name: str) -> str:
        '''Shorten names longer than 13 characters to their first 10 characters plus "...".'''
        return song_name[:10] + '...' if len(song_name) > 13 else song_name[:13]

    '''_saveresults'''
    def _saveresults(self, song_infos: list[SongInfo], file_name: str) -> str:
        '''Pickle `song_infos` (as dicts) into `file_name` inside each item's work dir.

        Normalizes every `work_dir` attribute to `str` and returns a comma-separated
        description of the directories written to, or `self.work_dir` when there is
        nothing to save.
        '''
        if not song_infos:
            return self.work_dir
        work_dir_to_song_info = defaultdict(list)
        for s in song_infos:
            s.work_dir = str(s.work_dir)
            work_dir_to_song_info[s.work_dir].append(s.todict())
        for w, items in work_dir_to_song_info.items():
            touchdir(w)
            self._savetopkl(items, os.path.join(w, file_name))
        return ', '.join(work_dir_to_song_info)

    '''_search'''
    @usesearchheaderscookies
    def _search(self, keyword: str = '', search_url: str = '', request_overrides: dict = None, song_infos: list = None, progress: Progress = None, progress_id: int = 0):
        '''Search a single `search_url` and append the results to `song_infos`; must be provided by subclasses.'''
        raise NotImplementedError('not be implemented')

    '''search'''
    @usesearchheaderscookies
    def search(self, keyword: str, num_threadings: int = 5, request_overrides: dict = None, rule: dict = None, main_process_context: Progress = None, main_progress_id: int = None, main_progress_lock: Lock = None):
        '''Search `keyword` on this source with a thread pool and return de-duplicated song infos.

        Args:
            keyword: search keyword.
            num_threadings: number of worker threads.
            request_overrides: extra request keyword arguments forwarded to `_search`.
            rule: source-specific search rule forwarded to `_constructsearchurls`.
            main_process_context: an already running `Progress` to report into; when
                omitted a private one is created and stopped before returning.
            main_progress_id: task id of an aggregated task inside `main_process_context`.
            main_progress_lock: lock guarding updates to the shared progress.

        Results are also pickled to `search_results.pkl` in the search work dir.
        '''
        # init
        rule, request_overrides = rule or {}, request_overrides or {}
        # logging
        self.logger_handle.info(f'Start to search music files using {self.source}.', disable_print=self.disable_print)
        # construct search urls
        search_urls = self._constructsearchurls(keyword=keyword, rule=rule, request_overrides=request_overrides)
        # multi threadings for searching music files
        owns_progress = main_process_context is None
        if owns_progress:
            main_process_context = Progress(TextColumn("{task.description}"), BarColumn(bar_width=None), MofNCompleteColumn(), TimeRemainingColumn(), refresh_per_second=10)
            main_process_context.__enter__()
        try:
            if main_progress_lock is None:
                main_progress_lock = Lock()
            with main_progress_lock:
                progress_id = main_process_context.add_task(f"{self.source}.search >>> completed (0/{len(search_urls)})", total=len(search_urls))
                if main_progress_id is not None:
                    cur_total = main_process_context.tasks[main_progress_id].total or 0
                    main_process_context.update(main_progress_id, total=cur_total + len(search_urls))
                    main_process_context.update(main_progress_id, description=f"Search from sources >>> completed ({int(main_process_context.tasks[main_progress_id].completed)}/{cur_total + len(search_urls)})")
            # one private result list per url, so workers never share a list
            per_url_song_infos, submitted_tasks = [[] for _ in search_urls], []
            with ThreadPoolExecutor(max_workers=num_threadings) as pool:
                for search_url, url_song_infos in zip(search_urls, per_url_song_infos):
                    submitted_tasks.append(pool.submit(self._search, keyword, search_url, request_overrides, url_song_infos, main_process_context, progress_id))
                for future in as_completed(submitted_tasks):
                    future.result()
                    with main_progress_lock:
                        main_process_context.advance(progress_id, 1)
                        num_searched_urls = int(main_process_context.tasks[progress_id].completed)
                        main_process_context.update(progress_id, description=f"{self.source}.search >>> completed ({num_searched_urls}/{len(search_urls)})")
                        if main_progress_id is None:
                            continue
                        main_process_context.advance(main_progress_id, 1)
                        main_process_context.update(main_progress_id, description=f"Search from sources >>> completed ({int(main_process_context.tasks[main_progress_id].completed)}/{int(main_process_context.tasks[main_progress_id].total or 0)})")
            song_infos = self._removeduplicates(song_infos=list(chain.from_iterable(per_url_song_infos)))
            work_dir = self._constructuniqueworkdir(keyword=keyword)
            for song_info in song_infos:
                song_info.work_dir = work_dir
                episodes = song_info.episodes if isinstance(song_info.episodes, list) else []
                for eps_info in episodes:
                    eps_info.work_dir = sanitize_filepath(os.path.join(work_dir, song_info.song_name))
                    touchdir(work_dir)
            # logging
            work_dir = self._saveresults(song_infos, "search_results.pkl")
            self.logger_handle.info(f'Finished searching music files using {self.source}. Search results have been saved to {work_dir}, valid items: {len(song_infos)}.', disable_print=self.disable_print)
        finally:
            # Always stop a progress display we started ourselves, even when a worker raised.
            if owns_progress:
                main_process_context.__exit__(None, None, None)
        # return
        return song_infos

    '''_download'''
    @usedownloadheaderscookies
    def _download(self, song_info: SongInfo, request_overrides: dict = None, downloaded_song_infos: list[SongInfo] = None, progress: Progress = None, song_progress_id: int = 0, auto_supplement_song: bool = True):
        '''Download one song (HLS, in-memory HTTP payload, or streamed HTTP) and record it on success.

        Args:
            song_info: the song to download.
            request_overrides: extra request keyword arguments; copied, never mutated.
            downloaded_song_infos: list that receives a copy of `song_info` on success;
                a fresh list is used when omitted.
            progress: progress display to report into.
            song_progress_id: task id of this song inside `progress`.
            auto_supplement_song: whether to run the supplement/lyrics/tags post-processing.

        Returns:
            `downloaded_song_infos`. Failures are reported through the progress
            description rather than raised.
        '''
        if downloaded_song_infos is None:
            downloaded_song_infos = []
        request_overrides = copy.deepcopy(request_overrides or {})
        desc_prefix = f"{self.source}.download >>> {self._shortsongname(song_info.song_name)}"
        protocol = song_info.protocol.upper()

        def record_success():
            song_info_copy = copy.deepcopy(song_info)
            if auto_supplement_song:
                song_info_copy = SongInfoUtils.supplsonginfothensavelyricsthenwritetags(song_info_copy, logger_handle=self.logger_handle, disable_print=self.disable_print)
            downloaded_song_infos.append(song_info_copy)

        if protocol in {'HLS'}:
            try:
                # Always take 'headers' out of the overrides, like the other dedicated
                # arguments, so it cannot reach the downloader a second time.
                override_headers = request_overrides.pop('headers', {})
                hls_downloader = HLSDownloader(
                    output_dir=song_info.work_dir, proxies=request_overrides.pop('proxies', {}) or self._autosetproxies(), headers=song_info.default_download_headers or override_headers or self.default_headers,
                    cookies=request_overrides.pop('cookies', {}) or self.default_cookies, logger_handle=self.logger_handle, verify_tls=request_overrides.pop('verify', True), timeout=request_overrides.pop('timeout', (10, 30)),
                    disable_print=self.disable_print, request_overrides=request_overrides
                )
                hls_downloader.download(song_info.download_url, song_info.save_path, quality='best', keep_segments=False, temp_subdir=str(song_info.identifier), progress=progress, progress_id=song_progress_id)
                record_success()
            except Exception as err:
                progress.update(song_progress_id, description=f"{desc_prefix} (Error: {err})")
        elif protocol in {'HTTP'} and song_info.downloaded_contents:
            try:
                touchdir(song_info.work_dir)
                # len() is the payload size; __sizeof__() would add the object overhead.
                total_size = len(song_info.downloaded_contents)
                progress.update(song_progress_id, total=total_size)
                with open(song_info.save_path, "wb") as fp:
                    fp.write(song_info.downloaded_contents)
                progress.advance(song_progress_id, total_size)
                progress.update(song_progress_id, description=f"{desc_prefix} (Success)")
                record_success()
            except Exception as err:
                progress.update(song_progress_id, description=f"{desc_prefix} (Error: {err})")
        elif protocol in {'HTTP'}:
            try:
                touchdir(song_info.work_dir)
                if song_info.default_download_headers:
                    request_overrides['headers'] = song_info.default_download_headers
                with self.get(song_info.download_url, stream=True, **request_overrides) as resp:
                    resp.raise_for_status()
                    total_size, chunk_size, downloaded_size = int(resp.headers.get('content-length', 0)), song_info.get('chunk_size', 1024), 0
                    progress.update(song_progress_id, total=total_size)
                    with open(song_info.save_path, "wb") as fp:
                        for chunk in resp.iter_content(chunk_size=chunk_size):
                            if not chunk:
                                continue
                            fp.write(chunk)
                            downloaded_size = downloaded_size + len(chunk)
                            if total_size > 0:
                                downloading_text = "%0.2fMB/%0.2fMB" % (downloaded_size / 1024 / 1024, total_size / 1024 / 1024)
                            else:
                                # unknown content length: grow the total along with the download
                                progress.update(song_progress_id, total=downloaded_size)
                                downloading_text = "%0.2fMB/%0.2fMB" % (downloaded_size / 1024 / 1024, downloaded_size / 1024 / 1024)
                            progress.advance(song_progress_id, len(chunk))
                            progress.update(song_progress_id, description=f"{desc_prefix} (Downloading: {downloading_text})")
                    progress.update(song_progress_id, description=f"{desc_prefix} (Success)")
                    record_success()
            except Exception as err:
                progress.update(song_progress_id, description=f"{desc_prefix} (Error: {err})")
        return downloaded_song_infos

    '''download'''
    @usedownloadheaderscookies
    def download(self, song_infos: list[SongInfo], num_threadings: int = 5, request_overrides: dict = None, auto_supplement_song: bool = True):
        '''Download `song_infos` with a thread pool and return the successfully downloaded ones.

        Results are also pickled to `download_results.pkl` in each work dir.
        '''
        # init
        request_overrides = request_overrides or {}
        shortenpathsinsonginfos(song_infos=song_infos)
        # logging
        self.logger_handle.info(f'Start to download music files using {self.source}.', disable_print=self.disable_print)
        # multi threadings for downloading music files
        columns = [SpinnerColumn(), TextColumn("{task.description}"), BarColumn(bar_width=None), TaskProgressColumn(), AudioAwareColumn(), TransferSpeedColumn(), TimeRemainingColumn()]
        with Progress(*columns, refresh_per_second=20, expand=True) as progress:
            songs_progress_id = progress.add_task(f"{self.source}.download >>> completed (0/{len(song_infos)})", total=len(song_infos), kind='overall')
            downloaded_song_infos, submitted_tasks = [], []
            song_progress_ids = [
                progress.add_task(f"{self.source}.download >>> {self._shortsongname(song_info.song_name)} (Preparing)", total=None, kind='download')
                for song_info in song_infos
            ]
            with ThreadPoolExecutor(max_workers=num_threadings) as pool:
                for song_progress_id, song_info in zip(song_progress_ids, song_infos):
                    submitted_tasks.append(pool.submit(self._download, song_info, request_overrides, downloaded_song_infos, progress, song_progress_id, auto_supplement_song))
                for _ in as_completed(submitted_tasks):
                    progress.advance(songs_progress_id, 1)
                    num_downloaded_songs = int(progress.tasks[songs_progress_id].completed)
                    progress.update(songs_progress_id, description=f"{self.source}.download >>> completed ({num_downloaded_songs}/{len(song_infos)})")
        # logging
        work_dir = self._saveresults(downloaded_song_infos, "download_results.pkl")
        self.logger_handle.info(f'Finished downloading music files using {self.source}. Download results have been saved to {work_dir}, valid downloads: {len(downloaded_song_infos)}.', disable_print=self.disable_print)
        # return
        return downloaded_song_infos

    '''parseplaylist'''
    @useparseheaderscookies
    def parseplaylist(self, playlist_url: str, request_overrides: dict = None):
        '''Parse a playlist URL into song infos; not supported by the base client.'''
        raise NotImplementedError(f'Not supported now to parse playlist from {self.source}')

    '''_autosetproxies'''
    def _autosetproxies(self):
        '''Return a random proxy mapping from the freeproxy client, or `{}` when disabled or on failure.'''
        if not self.auto_set_proxies:
            return {}
        try:
            proxies = self.proxied_session_client.getrandomproxy()
        except Exception as err:
            self.logger_handle.error(f'{self.source}._autosetproxies >>> freeproxy lib failed to auto fetch proxies (Error: {err})', disable_print=self.disable_print)
            proxies = {}
        return proxies

    '''_requestwithretries'''
    def _requestwithretries(self, method: str, url, **kwargs):
        '''Issue `self.session.<method>(url, ...)` up to `max_retries` times.

        Fills in default cookies, timeout and (for curl_cffi) an impersonation target.
        Returns the first response that passes `raise_for_status()`; when every attempt
        fails, returns the last response object obtained (possibly `None`).
        '''
        if 'cookies' not in kwargs:
            kwargs['cookies'] = self.default_cookies
        if 'timeout' not in kwargs:
            kwargs['timeout'] = (10, 30)
        if 'impersonate' not in kwargs and self.enable_curl_cffi and self.cc_impersonates:
            kwargs['impersonate'] = random.choice(self.cc_impersonates)
        # Extract caller-supplied proxies once so that they are honored on every retry.
        explicit_proxies = kwargs.pop('proxies', None)
        resp = None
        for _ in range(self.max_retries):
            if not self.maintain_session:
                self._initsession()
                if self.random_update_ua:
                    self.session.headers.update({'User-Agent': build_user_agent()})
            proxies = explicit_proxies or self._autosetproxies()
            try:
                resp = getattr(self.session, method)(url, proxies=proxies, **kwargs)
                resp.raise_for_status()
            except Exception as err:
                self.logger_handle.error(f'{self.source}.{method} >>> {url} (Error: {err}; status={getattr(resp, "status_code", None)})', disable_print=self.disable_print)
                continue
            return resp
        return resp

    '''get'''
    def get(self, url, **kwargs):
        '''HTTP GET with default cookies/timeout, optional proxies and retries.'''
        return self._requestwithretries('get', url, **kwargs)

    '''post'''
    def post(self, url, **kwargs):
        '''HTTP POST with default cookies/timeout, optional proxies and retries.'''
        return self._requestwithretries('post', url, **kwargs)

    '''_savetopkl'''
    def _savetopkl(self, data, file_path, auto_sanitize=True):
        '''Pickle `data` to `file_path`, sanitizing the path first unless `auto_sanitize` is false.'''
        if auto_sanitize:
            file_path = sanitize_filepath(file_path)
        with open(file_path, 'wb') as fp:
            pickle.dump(data, fp)