''' Function: Implementation of GequhaiMusicClient: https://www.gequhai.com/ Author: Zhenchao Jin WeChat Official Account (微信公众号): Charles的皮卡丘 ''' import re import base64 import json_repair from bs4 import BeautifulSoup from rich.progress import Progress from ..sources import BaseMusicClient from urllib.parse import urljoin, urlparse from ..utils import legalizestring, usesearchheaderscookies, resp2json, safeextractfromdict, extractdurationsecondsfromlrc, seconds2hms, searchdictbykey, cleanlrc, SongInfo, QuarkParser, AudioLinkTester '''GequhaiMusicClient''' class GequhaiMusicClient(BaseMusicClient): source = 'GequhaiMusicClient' def __init__(self, **kwargs): super(GequhaiMusicClient, self).__init__(**kwargs) if not self.quark_parser_config.get('cookies'): self.logger_handle.warning(f'{self.source}.__init__ >>> "quark_parser_config" is not configured, so song downloads are restricted and only mp3 files can be downloaded.') self.default_search_headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"} self.default_download_headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"} self.default_headers = self.default_search_headers self._initsession() '''_constructsearchurls''' def _constructsearchurls(self, keyword: str, rule: dict = None, request_overrides: dict = None): # init rule, request_overrides = rule or {}, request_overrides or {} # construct search urls self.search_size_per_page = min(self.search_size_per_source, 12) search_urls, page_size, count = [], self.search_size_per_page, 0 while self.search_size_per_source > count: if int(count // page_size) + 1 == 1: search_urls.append(f'https://www.gequhai.com/s/{keyword}') else: search_urls.append(f'https://www.gequhai.com/s/{keyword}?page={int(count // page_size) + 1}') count += page_size # return return search_urls '''_parsesearchresultsfromhtml''' def _parsesearchresultsfromhtml(self, html_text: str): soup, base_url, search_results = BeautifulSoup(html_text, "html.parser"), "https://www.gequhai.com", [] if not (table := soup.select_one("table#myTables")): return [] for tr in table.select("tbody tr"): if len((tds := tr.find_all("td"))) < 3: continue idx_text = tds[0].get_text(strip=True); a = tds[1].find("a") title = a.get_text(strip=True) if a else tds[1].get_text(strip=True) href: str = a.get("href", "") if a else ""; play_url = urljoin(base_url, href) if href else "" singer = tds[2].get_text(strip=True); m = re.search(r"/play/(\d+)", href or ""); play_id = m.group(1) if m else None search_results.append({"index": int(idx_text) if idx_text.isdigit() else idx_text, "title": title, "singer": singer, "href": href, "play_url": play_url, "play_id": play_id}) return search_results '''_decodequarkurl''' def _decodequarkurl(self, quark_url: str): return base64.b64decode(quark_url.replace("#", "H")).decode("utf-8", errors="strict") '''_extractappdataandwindowvars''' def _extractappdataandwindowvars(self, js_text: str) -> dict: out, m = {}, re.search(r"window\.appData\s*=\s*(\{.*?\})\s*;", js_text, flags=re.S) if m: app = json_repair.loads(m.group(1)); out["appData"] = app; out.update(app) for k, v in re.findall(r"window\.(\w+)\s*=\s*'([^']*)'\s*;", js_text): out[k] = v for k, v in re.findall(r'window\.(\w+)\s*=\s*"([^"]*)"\s*;', js_text): out[k] = v seen = set(out); out.update({k: int(v) if re.fullmatch(r"-?\d+", v) else float(v) for k, v in re.findall(r"window\.(\w+)\s*=\s*(-?\d+(?:\.\d+)?)\s*;", js_text) if not (k in seen or seen.add(k))}) seen = set(out); out.update({k: {"true": True, "false": False, "null": None}[str(v).lower()] for k, v in re.findall(r"window\.(\w+)\s*=\s*(true|false|null)\s*;", js_text, flags=re.I) if not (k in seen or seen.add(k))}) if "mp3_title" in out and "mp3_author" in out: out.setdefault("mp3_name", f"{out['mp3_title']}-{out['mp3_author']}") if "mp3_extra_url" in out: out["mp3_extra_url_decoded"] = self._decodequarkurl(out["mp3_extra_url"]) return out '''_parsesearchresultfromquark''' def _parsesearchresultfromquark(self, search_result: dict, download_result: dict, soup: BeautifulSoup, request_overrides: dict = None): # init request_overrides, song_info = request_overrides or {}, SongInfo(source=self.source) # parse quark_download_url = download_result.get('mp3_extra_url_decoded', '') if not quark_download_url or not str(quark_download_url).startswith('http'): return song_info download_result['quark_parse_result'], download_url = QuarkParser.parsefromdirurl(quark_download_url, **self.quark_parser_config) duration = [int(float(d)) for d in searchdictbykey(download_result, 'duration') if int(float(d)) > 0]; duration_in_secs = duration[0] if duration else 0 song_info = SongInfo( raw_data={'search': search_result, 'download': download_result, 'lyric': {}}, source=self.source, song_name=legalizestring(download_result.get('mp3_title')), singers=legalizestring(download_result.get('mp3_author', None)), album='NULL', ext='mp3', file_size=None, identifier=download_result.get('mp3_id') or urlparse(str(search_result['play_url'])).path.strip('/').split('/')[-1], duration_s=duration_in_secs, duration=seconds2hms(duration_in_secs), lyric=cleanlrc(soup.find("div", id="content-lrc2").get_text("\n", strip=True)), cover_url=download_result.get('mp3_cover'), download_url=download_url, download_url_status=self.quark_audio_link_tester.test(download_url, request_overrides), default_download_headers=self.quark_default_download_headers, ) song_info.download_url_status['probe_status'] = self.quark_audio_link_tester.probe(song_info.download_url, request_overrides) song_info.file_size = song_info.download_url_status['probe_status']['file_size']; song_info.ext = song_info.download_url_status['probe_status']['ext'] if (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS) and (song_info.download_url_status['probe_status']['ext'] in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = song_info.download_url_status['probe_status']['ext'] elif (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = 'mp3' if not song_info.with_valid_download_url: return SongInfo(source=self.source) if not song_info.lyric or '歌词获取失败' in song_info.lyric: song_info.lyric = 'NULL' if not song_info.duration or song_info.duration == '-:-:-': song_info.duration = seconds2hms(extractdurationsecondsfromlrc(song_info.lyric)) # return return song_info '''_parsesearchresultfromweb''' def _parsesearchresultfromweb(self, search_result: dict, download_result: dict, soup: BeautifulSoup, request_overrides: dict = None): # init request_overrides, song_info = request_overrides or {}, SongInfo(source=self.source) # parse if 'play_id' not in download_result or not download_result['play_id']: return song_info headers = { "accept": "application/json, text/javascript, */*; q=0.01", "accept-encoding": "gzip, deflate, br, zstd", "accept-language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7", "content-type": "application/x-www-form-urlencoded; charset=UTF-8", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "sec-ch-ua": "\"Google Chrome\";v=\"143\", \"Chromium\";v=\"143\", \"Not A(Brand\";v=\"24\"", "x-custom-header": "SecretKey", "x-requested-with": "XMLHttpRequest", "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"Windows\"", "sec-fetch-dest": "empty", "origin": "https://www.gequhai.com", "priority": "u=1, i", "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36", } (resp := self.post('https://www.gequhai.com/api/music', data={'id': download_result['play_id'], 'type': '0'}, headers=headers, **request_overrides)).raise_for_status() download_result['api/music'] = resp2json(resp=resp); download_url = safeextractfromdict(download_result['api/music'], ['data', 'url'], '') if not download_url or not str(download_url).startswith('http'): return song_info song_info = SongInfo( raw_data={'search': search_result, 'download': download_result, 'lyric': {}}, source=self.source, song_name=legalizestring(download_result.get('mp3_title')), singers=legalizestring(download_result.get('mp3_author')), album='NULL', ext=download_url.split('?')[0].split('.')[-1], file_size=None, identifier=download_result.get('mp3_id') or urlparse(str(search_result['play_url'])).path.strip('/').split('/')[-1], duration='-:-:-', lyric=cleanlrc(soup.find("div", id="content-lrc2").get_text("\n", strip=True)), cover_url=download_result.get('mp3_cover'), download_url=download_url, download_url_status=self.audio_link_tester.test(download_url, request_overrides), ) song_info.download_url_status['probe_status'] = self.audio_link_tester.probe(song_info.download_url, request_overrides) song_info.file_size = song_info.download_url_status['probe_status']['file_size'] if (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS) and (song_info.download_url_status['probe_status']['ext'] in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = song_info.download_url_status['probe_status']['ext'] elif (song_info.ext not in AudioLinkTester.VALID_AUDIO_EXTS): song_info.ext = 'mp3' if not song_info.with_valid_download_url: return SongInfo(source=self.source) if not song_info.lyric or '歌词获取失败' in song_info.lyric: song_info.lyric = 'NULL' if not song_info.duration or song_info.duration == '-:-:-': song_info.duration = seconds2hms(extractdurationsecondsfromlrc(song_info.lyric)) # return return song_info '''_search''' @usesearchheaderscookies def _search(self, keyword: str = '', search_url: str = '', request_overrides: dict = None, song_infos: list = [], progress: Progress = None, progress_id: int = 0): # init request_overrides = request_overrides or {} # successful try: # --search results (resp := self.get(search_url, **request_overrides)).raise_for_status() search_results = self._parsesearchresultsfromhtml(resp.text) for search_result in search_results: # --download results if not isinstance(search_result, dict) or ('play_url' not in search_result): continue song_info = SongInfo(source=self.source) # ----fetch basic information try: (resp := self.get(search_result['play_url'], **request_overrides)).raise_for_status(); download_result = self._extractappdataandwindowvars(resp.text) except Exception: continue soup = BeautifulSoup(resp.text, 'lxml') # ----parse from quark links if self.quark_parser_config.get('cookies'): song_info = self._parsesearchresultfromquark(search_result, download_result, soup, request_overrides) # ----parse from play url if not song_info.with_valid_download_url: song_info = self._parsesearchresultfromweb(search_result, download_result, soup, request_overrides) # ----filter if invalid if not song_info.with_valid_download_url: continue # --append to song_infos song_infos.append(song_info) # --judgement for search_size if self.strict_limit_search_size_per_page and len(song_infos) >= self.search_size_per_page: break # --update progress progress.update(progress_id, description=f"{self.source}.search >>> {search_url} (Success)") # failure except Exception as err: progress.update(progress_id, description=f"{self.source}.search >>> {search_url} (Error: {err})") # return return song_infos