|
@@ -4,38 +4,555 @@
|
|
|
@Contact : liuyuqi.gov@msn.cn
|
|
@Contact : liuyuqi.gov@msn.cn
|
|
|
@Time : 2024/07/30 19:02:15
|
|
@Time : 2024/07/30 19:02:15
|
|
|
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
-@Desc :
|
|
|
|
|
|
|
+@Desc : Font downloader for fonts.net.cn and chinaz.com
|
|
|
'''
|
|
'''
|
|
|
|
|
+import os
|
|
|
|
|
+import re
|
|
|
|
|
+import time
|
|
|
|
|
+import zipfile
|
|
|
import requests
|
|
import requests
|
|
|
|
|
+from bs4 import BeautifulSoup
|
|
|
|
|
+from urllib.parse import urljoin, urlparse
|
|
|
|
|
+from tqdm import tqdm
|
|
|
|
|
+from typing import List, Dict, Optional, Tuple
|
|
|
|
|
+
|
|
|
|
|
|
|
|
class Font(object):
    """Interactive font downloader for fonts.net.cn and font.chinaz.com.

    The class drives a small text menu (see :meth:`run`), scrapes list and
    detail pages with a shared ``requests`` session, and streams every
    downloaded file into ``download_dir``.  ZIP archives are unpacked next to
    the archive after a successful download.
    """

    # Default headers installed on the shared session.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    }

    FONTSNET_BASE = 'https://www.fonts.net.cn'
    CHINAZ_BASE = 'https://font.chinaz.com'

    # Detail-page link on a fonts.net.cn list page; group 1 is the font id.
    _FONT_DETAIL_RE = re.compile(r'/font-(\d+)\.html')
    # Anchor text that marks a download link on a fonts.net.cn detail page.
    _DOWNLOAD_TEXT_RE = re.compile(r'下载|download|Download')
    # hrefs that look like a direct archive / font file.
    _FONTSNET_FILE_RE = re.compile(r'\.(zip|rar|7z|ttf|otf|woff)', re.I)
    _CHINAZ_FILE_RE = re.compile(r'\.(zip|rar|7z|ttf|otf)', re.I)
    _CHINAZ_TEXT_RE = re.compile(r'下载|download|本地|高速', re.I)
    # URL-looking string literals inside inline <script> blocks.
    _SCRIPT_URL_RES = (
        re.compile(r'["\'](https?://[^"\']+\.(?:zip|rar|7z|ttf|otf|woff))["\']'),
        re.compile(r'["\'](/download/[^"\']+)["\']'),
    )
    # Characters replaced by '_' in file names: the Windows-reserved set plus
    # ASCII control characters.
    _INVALID_CHAR_TABLE = str.maketrans(
        dict.fromkeys('<>:"/\\|?*' + ''.join(map(chr, range(32))), '_')
    )

    def __init__(self, download_dir: str = './fonts'):
        """Create the HTTP session and make sure ``download_dir`` exists."""
        self.sess = requests.Session()
        self.sess.headers.update(self.header)
        self.download_dir = download_dir
        self._ensure_download_dir()

    def _ensure_download_dir(self):
        """Create the download directory (and parents) if it is missing."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.download_dir, exist_ok=True)

    def run(self):
        """Show the top-level menu once and dispatch the selected action."""
        print("=" * 50)
        print("字体下载工具")
        print("=" * 50)
        print("\n请选择要爬取的网站:")
        print("1. 字体天下 (fonts.net.cn) - 商用免费字体")
        print("2. 站长字体 (font.chinaz.com)")
        print("3. 下载指定字体ID")
        print("4. 退出")

        choice = input("\n请输入选项 (1-4): ").strip()

        if choice == '1':
            self.crawl_fontsnet()
        elif choice == '2':
            self.crawl_chinaz()
        elif choice == '3':
            font_id = input("请输入字体ID (例如: 37476120124): ").strip()
            if font_id:
                self.download_font_by_id(font_id)
        elif choice == '4':
            print("退出程序")
            return
        else:
            print("无效选项,退出程序")

    def crawl_fontsnet(self):
        """Show the fonts.net.cn sub-menu and dispatch the selected crawl."""
        print("\n" + "=" * 50)
        print("开始爬取字体天下 (fonts.net.cn)")
        print("=" * 50)

        print("\n请选择爬取方式:")
        print("1. 按分类爬取 (中文字体/英文字体/图形字体)")
        print("2. 爬取商用免费字体")
        print("3. 爬取最新字体")
        print("4. 返回主菜单")

        choice = input("\n请输入选项 (1-4): ").strip()

        if choice == '1':
            self._crawl_by_category()
        elif choice == '2':
            self._crawl_free_commercial()
        elif choice == '3':
            self._crawl_latest()
        elif choice == '4':
            return
        else:
            print("无效选项")

    def _crawl_by_category(self):
        """Ask for a category, list its fonts and hand them to the action menu."""
        print("\n分类选项:")
        print("1. 中文字体")
        print("2. 英文字体")
        print("3. 图形字体")
        print("4. 返回")

        choice = input("\n请选择分类 (1-4): ").strip()

        category_urls = {
            '1': '/font-zh.html',
            '2': '/font-en.html',
            '3': '/font-other.html',
        }

        if choice in category_urls:
            url = self.FONTSNET_BASE + category_urls[choice]
            fonts = self._get_fonts_from_list_page(url)
            self._process_font_list(fonts)
        elif choice == '4':
            return
        else:
            print("无效选项")

    def _crawl_free_commercial(self):
        """List fonts from the Chinese-font page, requesting the free filter."""
        print("\n爬取商用免费字体...")
        url = self.FONTSNET_BASE + '/font-zh.html'
        # NOTE(review): filter_free is forwarded, but the list parser does not
        # apply any filtering yet, so this currently lists the same fonts as
        # the plain Chinese category.
        fonts = self._get_fonts_from_list_page(url, filter_free=True)
        self._process_font_list(fonts)

    def _crawl_latest(self):
        """List the first 20 fonts found on the Chinese-font page."""
        print("\n爬取最新字体...")
        url = self.FONTSNET_BASE + '/font-zh.html'
        fonts = self._get_fonts_from_list_page(url)
        self._process_font_list(fonts[:20])

    def _get_fonts_from_list_page(self, url: str, filter_free: bool = False) -> List[Dict]:
        """Collect the fonts linked from a fonts.net.cn list page.

        Args:
            url: Absolute URL of the list page.
            filter_free: Accepted for interface compatibility; not applied yet
                (TODO: implement once the page markup for free fonts is known).

        Returns:
            One ``{'id', 'name', 'url'}`` dict per distinct font id, in page
            order.  An empty list is returned when the request or the parsing
            fails; the error is printed, not raised.
        """
        fonts_by_id: Dict[str, Dict] = {}
        try:
            print(f"正在访问: {url}")
            resp = self.sess.get(url, timeout=30)
            # Do not parse an error page as if it were a font list.
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, 'lxml')

            for link in soup.find_all('a', href=self._FONT_DETAIL_RE):
                href = link.get('href', '')
                match = self._FONT_DETAIL_RE.search(href)
                if not match:
                    continue
                font_id = match.group(1)
                placeholder = f'font_{font_id}'
                font_name = link.get_text(strip=True) or placeholder

                known = fonts_by_id.get(font_id)
                if known is None:
                    fonts_by_id[font_id] = {
                        'id': font_id,
                        'name': font_name,
                        'url': urljoin(self.FONTSNET_BASE, href),
                    }
                elif known['name'] == placeholder and font_name != placeholder:
                    # The first anchor for this id had no text (e.g. it wrapped
                    # an image); take the real name from the later anchor.
                    known['name'] = font_name

            print(f"找到 {len(fonts_by_id)} 个字体")

        except Exception as e:
            print(f"获取字体列表失败: {e}")

        return list(fonts_by_id.values())

    @staticmethod
    def _parse_range(range_str: str) -> Tuple[int, int]:
        """Parse a 1-based inclusive ``"start-end"`` range.

        Raises:
            ValueError: if the text is not two integers separated by ``-`` or
                if it does not satisfy ``1 <= start <= end``.
        """
        start, end = map(int, range_str.split('-'))
        if start < 1 or end < start:
            raise ValueError(f"invalid range {range_str!r}: need 1 <= start <= end")
        return start, end

    def _process_font_list(self, fonts: List[Dict]):
        """Preview ``fonts`` and let the user choose what to download."""
        if not fonts:
            print("没有找到可下载的字体")
            return

        print(f"\n找到 {len(fonts)} 个字体:")
        for i, font in enumerate(fonts[:10], 1):
            print(f" {i}. {font['name']} (ID: {font['id']})")

        if len(fonts) > 10:
            print(f" ... 还有 {len(fonts) - 10} 个字体")

        print("\n操作选项:")
        print("1. 下载所有字体")
        print("2. 下载指定范围 (例如: 1-5)")
        print("3. 输入字体ID下载")
        print("4. 返回")

        choice = input("\n请选择操作 (1-4): ").strip()

        if choice == '1':
            for font in tqdm(fonts, desc="下载字体"):
                self.download_font(font)
        elif choice == '2':
            range_str = input("请输入范围 (例如: 1-5): ").strip()
            try:
                # Validated here so that "0-5" does not turn into the slice
                # fonts[-1:5], which would silently select the wrong items.
                start, end = self._parse_range(range_str)
            except ValueError as e:
                print(f"输入格式错误: {e}")
                return
            for font in fonts[start - 1:end]:
                self.download_font(font)
        elif choice == '3':
            font_id = input("请输入字体ID: ").strip()
            if font_id:
                self.download_font_by_id(font_id)
        elif choice == '4':
            return
        else:
            print("无效选项")

    def download_font_by_id(self, font_id: str):
        """Download the fonts.net.cn font whose detail page is ``font-<id>.html``."""
        font = {
            'id': font_id,
            'name': f'font_{font_id}',
            'url': f'{self.FONTSNET_BASE}/font-{font_id}.html'
        }
        self.download_font(font)

    def download_font(self, font: Dict) -> bool:
        """Download one font described by an ``{'id', 'name', 'url'}`` dict.

        The detail page is parsed for candidate links, which are tried in
        order until one download succeeds.

        Returns:
            True if a file was saved, False otherwise (errors are printed).
        """
        print(f"\n正在处理字体: {font['name']} (ID: {font['id']})")

        detail_url = font.get('url', '')
        if not detail_url:
            detail_url = f'{self.FONTSNET_BASE}/font-{font["id"]}.html'

        try:
            download_urls = self._parse_detail_page(detail_url)

            if not download_urls:
                print(f" 未找到下载链接: {font['name']}")
                return False

            success = False
            for url_info in download_urls:
                download_url = url_info.get('url', '')
                download_type = url_info.get('type', 'unknown')

                print(f" 尝试下载 ({download_type}): {download_url[:50]}...")

                save_path = self._download_file(download_url, font['name'])
                if save_path:
                    print(f" 下载成功: {save_path}")
                    success = True
                    break

            return success

        except Exception as e:
            print(f" 下载失败: {font['name']}, 错误: {e}")
            return False

    def _parse_detail_page(self, url: str) -> List[Dict]:
        """Extract candidate download URLs from a fonts.net.cn detail page.

        Three sources are scanned, in this order: anchors whose text mentions
        "download", anchors whose href looks like an archive or font file, and
        URL-looking string literals inside ``<script>`` tags.

        Returns:
            ``{'url', 'type', 'text'}`` dicts without duplicate URLs; empty if
            the page could not be fetched or parsed (the error is printed).
        """
        download_urls: List[Dict] = []
        seen_urls = set()

        def add(candidate: str, kind: str, text: str) -> None:
            # Resolve against the page URL so relative hrefs work, and keep
            # only the first occurrence of every URL.
            full_url = urljoin(url, candidate)
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                download_urls.append({'url': full_url, 'type': kind, 'text': text})

        try:
            resp = self.sess.get(url, timeout=30)
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, 'lxml')

            for link in soup.find_all('a', string=self._DOWNLOAD_TEXT_RE):
                href = link.get('href', '')
                if not href or href.startswith(('#', 'javascript')):
                    continue
                text = link.get_text(strip=True)
                lowered = text.lower()
                if '免费' in text or 'free' in lowered:
                    download_type = 'free'
                elif '官网' in text or 'official' in lowered:
                    download_type = 'official'
                else:
                    download_type = 'direct'
                add(href, download_type, text)

            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if self._FONTSNET_FILE_RE.search(href):
                    add(href, 'direct_file', link.get_text(strip=True))

            for script in soup.find_all('script'):
                script_text = script.get_text() if script else ''
                if not script_text:
                    continue
                for pattern in self._SCRIPT_URL_RES:
                    for match in pattern.findall(script_text):
                        add(match, 'script_extracted', '从脚本提取')

            print(f" 解析到 {len(download_urls)} 个下载链接")

        except Exception as e:
            print(f" 解析详情页失败: {e}")

        return download_urls

    @staticmethod
    def _content_length(headers) -> int:
        """Return the ``Content-Length`` header as an int, or 0 if absent/invalid."""
        try:
            return int(headers.get('Content-Length', 0))
        except (TypeError, ValueError):
            return 0

    def _stream_to_file(self, resp, save_path: str, total_size: int) -> None:
        """Write the streamed body of ``resp`` to ``save_path``.

        A progress bar is shown only when ``total_size`` is known (> 0).
        """
        with open(save_path, 'wb') as f:
            if total_size > 0:
                with tqdm(total=total_size, unit='B', unit_scale=True, desc=' 下载') as pbar:
                    for chunk in resp.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
            else:
                for chunk in resp.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

    @staticmethod
    def _discard_partial(path: Optional[str]) -> None:
        """Remove an incomplete download, ignoring a missing or locked file."""
        if not path:
            return
        try:
            os.remove(path)
        except OSError:
            pass

    def _download_file(self, url: str, font_name: str, referer: Optional[str] = None) -> Optional[str]:
        """Download ``url`` into the download directory.

        Args:
            url: File URL to fetch.
            font_name: Used as the file name stem when neither the response
                headers nor the URL provide a file name.
            referer: Value for the ``Referer`` header; defaults to the
                fonts.net.cn base URL.

        Returns:
            The saved path, or None when the server did not answer 200, the
            response is an HTML page, the body is empty, or any error occurred
            (errors are printed, never raised).
        """
        save_path: Optional[str] = None
        complete = False
        try:
            headers = self.header.copy()
            headers['Referer'] = referer or self.FONTSNET_BASE

            # The context manager releases the streamed connection on every
            # exit path, including the early returns below.
            with self.sess.get(url, headers=headers, stream=True, timeout=60,
                               allow_redirects=True) as resp:
                if resp.status_code != 200:
                    print(f" HTTP状态码: {resp.status_code}")
                    return None

                content_type = resp.headers.get('Content-Type', '')
                if content_type.lower().startswith('text/html'):
                    # A "download" link that leads to another web page must
                    # not be stored (and reported) as a font archive.
                    print(f" 跳过: 响应是网页而不是字体文件 ({content_type})")
                    return None

                content_disposition = resp.headers.get('Content-Disposition', '')
                filename = self._extract_filename(content_disposition, url, font_name)
                save_path = os.path.join(self.download_dir, self._sanitize_filename(filename))
                total_size = self._content_length(resp.headers)

                print(f" 保存到: {save_path}")
                if total_size > 0:
                    print(f" 文件大小: {total_size / 1024:.1f} KB")

                self._stream_to_file(resp, save_path, total_size)

            file_size = os.path.getsize(save_path)
            if file_size == 0:
                os.remove(save_path)
                return None
            complete = True

            print(f" 下载完成,大小: {file_size / 1024:.1f} KB")
            if save_path.lower().endswith('.zip'):
                self._extract_zip(save_path)
            return save_path

        except requests.exceptions.Timeout:
            print(" 下载超时")
        except requests.exceptions.ConnectionError as e:
            print(f" 连接错误: {e}")
        except Exception as e:
            print(f" 下载错误: {e}")

        if not complete:
            # Do not leave a truncated file behind after a failed transfer.
            self._discard_partial(save_path)
        return None

    def _extract_filename(self, content_disposition: str, url: str, default_name: str) -> str:
        """Choose a file name for a download.

        Preference order: the RFC 5987 ``filename*=`` parameter (charset-aware,
        percent-decoded), the plain ``filename=`` parameter, the percent-decoded
        last path segment of ``url`` if it contains a dot, and finally
        ``"<default_name>.zip"``.
        """
        from urllib.parse import unquote  # local: not among the file-level imports

        if content_disposition:
            extended = re.search(
                r"filename\*\s*=\s*([^'\";\s]*)'[^']*'([^;\s]+)",
                content_disposition, re.I)
            if extended:
                charset = extended.group(1) or 'utf-8'
                try:
                    filename = unquote(extended.group(2), encoding=charset, errors='replace')
                except LookupError:
                    # Unknown charset label: fall back to UTF-8 decoding.
                    filename = unquote(extended.group(2))
                filename = filename.strip()
                if filename:
                    return filename

            plain = re.search(r'filename\s*=\s*((["\']).*?\2|[^;\n]*)',
                              content_disposition, re.I)
            if plain:
                filename = plain.group(1).strip().strip('"\'')
                if filename:
                    return filename

        path = urlparse(url).path
        if path and path != '/':
            filename = os.path.basename(unquote(path))
            if filename and '.' in filename:
                return filename

        return f"{default_name}.zip"

    def _sanitize_filename(self, filename: str) -> str:
        """Return ``filename`` made safe to use as a single path component.

        Reserved characters (``<>:"/\\|?*``) and control characters become
        ``_``; surrounding whitespace and trailing dots are dropped; names
        longer than 200 characters are shortened while keeping (at most 10
        characters of) the extension.  If nothing usable remains, e.g. for
        ``""`` or ``".."``, ``"unnamed_font"`` is returned.
        """
        cleaned = filename.translate(self._INVALID_CHAR_TABLE)
        cleaned = cleaned.strip().rstrip('. ')

        if len(cleaned) > 200:
            name, ext = os.path.splitext(cleaned)
            cleaned = name[:190] + ext[:10]

        return cleaned or 'unnamed_font'

    def _extract_zip(self, zip_path: str):
        """Unpack ``zip_path`` into a sibling directory and list the fonts found.

        The target directory is the archive path without its extension.
        Failures are printed, never raised.
        """
        try:
            extract_dir = os.path.splitext(zip_path)[0]
            os.makedirs(extract_dir, exist_ok=True)

            with zipfile.ZipFile(zip_path, 'r') as zf:
                print(f" 解压到: {extract_dir}")
                zf.extractall(extract_dir)

            font_files = [
                os.path.join(root, file)
                for root, _dirs, files in os.walk(extract_dir)
                for file in files
                if file.lower().endswith(('.ttf', '.otf', '.woff', '.woff2'))
            ]

            if font_files:
                print(f" 找到 {len(font_files)} 个字体文件:")
                for ff in font_files[:5]:
                    print(f" - {os.path.basename(ff)}")
                if len(font_files) > 5:
                    print(f" ... 还有 {len(font_files) - 5} 个")

        except zipfile.BadZipFile:
            print(" 警告: 不是有效的 ZIP 文件")
        except Exception as e:
            print(f" 解压失败: {e}")

    def crawl_chinaz(self):
        """Prompt for a font.chinaz.com detail-page URL and download that font."""
        print("\n" + "=" * 50)
        print("开始爬取站长字体 (font.chinaz.com)")
        print("=" * 50)

        print("\n功能开发中...")
        print("站长字体网站结构:")
        print(" - 首页: https://font.chinaz.com/")
        print(" - 分类页面: https://font.chinaz.com/zhongwenziti.html")
        print(" - 详情页: https://font.chinaz.com/{font_id}.html")

        print("\n请输入要下载的字体详情页URL,或输入 'back' 返回主菜单:")
        url = input("URL: ").strip()

        if url.lower() == 'back':
            return

        if url.startswith('http'):
            self._download_chinaz_font(url)
        else:
            print("无效的URL")

    def _download_chinaz_font(self, url: str) -> bool:
        """Download the font offered on a font.chinaz.com detail page.

        The font name is the part of the page ``<title>`` before the first
        ``|`` or ``_``.  Candidate links are anchors whose text mentions a
        download and anchors whose href looks like an archive or font file.

        Returns:
            True if a file was saved, False otherwise (errors are printed).
        """
        print(f"\n正在处理: {url}")

        try:
            resp = self.sess.get(url, timeout=30)
            resp.raise_for_status()
            resp.encoding = 'utf-8'
            soup = BeautifulSoup(resp.text, 'lxml')

            font_name = 'unknown_chinaz_font'
            title_tag = soup.find('title')
            if title_tag:
                match = re.search(r'([^|_]+)', title_tag.get_text())
                # Keep the default when the title segment is only whitespace.
                if match and match.group(1).strip():
                    font_name = match.group(1).strip()

            download_urls: List[Dict] = []
            seen_urls = set()

            def add(candidate: str, kind: str, text: str) -> None:
                # Resolve against the page that was actually requested and
                # skip URLs that were already collected.
                full_url = urljoin(url, candidate)
                if full_url not in seen_urls:
                    seen_urls.add(full_url)
                    download_urls.append({'url': full_url, 'type': kind, 'text': text})

            links = soup.find_all('a', href=True)
            for link in links:
                href = link.get('href', '')
                text = link.get_text(strip=True)
                if not self._CHINAZ_TEXT_RE.search(text):
                    continue
                if href and not href.startswith(('#', 'javascript')):
                    add(href, 'chinaz_download', text)

            for link in links:
                href = link.get('href', '')
                if self._CHINAZ_FILE_RE.search(href):
                    add(href, 'direct_file', link.get_text(strip=True))

            print(f" 解析到 {len(download_urls)} 个下载链接")

            for url_info in download_urls:
                print(f" 尝试下载: {url_info['url'][:60]}...")
                # Send the detail page as Referer instead of fonts.net.cn.
                save_path = self._download_file(url_info['url'], font_name, referer=url)
                if save_path:
                    print(f" 下载成功: {save_path}")
                    return True

            print(" 未找到可下载的链接")
            return False

        except Exception as e:
            print(f" 处理失败: {e}")
            return False

    def download(self, url: str, save_path: Optional[str] = None) -> Optional[str]:
        """Download ``url`` to ``save_path``.

        Args:
            url: File URL to fetch.
            save_path: Destination path.  When omitted, a name is derived from
                the response headers or the URL and placed in the download
                directory.

        Returns:
            The path written, or None on a non-200 answer or any error (errors
            are printed, never raised).
        """
        print(f"\n下载: {url}")

        target: Optional[str] = None
        try:
            with self.sess.get(url, stream=True, timeout=60) as resp:
                if resp.status_code != 200:
                    print(f" HTTP状态码: {resp.status_code}")
                    return None

                if save_path:
                    target = save_path
                    parent = os.path.dirname(target)
                    if parent:
                        # A caller-supplied path may point into a new folder.
                        os.makedirs(parent, exist_ok=True)
                else:
                    content_disposition = resp.headers.get('Content-Disposition', '')
                    filename = self._extract_filename(content_disposition, url, 'downloaded_font')
                    target = os.path.join(self.download_dir, self._sanitize_filename(filename))

                self._stream_to_file(resp, target, self._content_length(resp.headers))

            print(f" 保存到: {target}")
            return target

        except Exception as e:
            print(f" 下载失败: {e}")
            # Do not leave a truncated file behind after a failed transfer.
            self._discard_partial(target)
            return None
|
|
|
|
|
|
|
def main():
    """Build a downloader with the default directory and start its menu."""
    Font().run()


if __name__ == '__main__':
    main()
|