Browse Source

feat(font): 实现字体下载器核心功能并添加依赖

添加requests、beautifulsoup4、lxml和tqdm依赖
实现字体天下和站长字体网站的爬取功能
支持按分类、商用免费和最新字体下载
添加文件下载、解压和进度显示功能
完善用户交互界面和错误处理
liuyuqi-cnb 4 days ago
parent
commit
3727411230

+ 26 - 0
crawl_font.egg-info/PKG-INFO

@@ -0,0 +1,26 @@
+Metadata-Version: 2.4
+Name: crawl-font
+Version: 0.1.0
+Summary: Font downloader for fonts.net.cn and other font websites
+Requires-Python: >=3.12
+Description-Content-Type: text/markdown
+Requires-Dist: requests>=2.32.5
+Requires-Dist: beautifulsoup4>=4.12.0
+Requires-Dist: lxml>=5.0.0
+Requires-Dist: tqdm>=4.66.0
+
+# crawl_font
+
+A command-line tool for downloading fonts from Chinese font websites.
+
+## Develop
+
+```
+uv sync
+uv run main.py
+```
+
+## License
+
+Licensed under the [Apache 2.0](LICENSE) © [liuyuqi.gov@msn.cn](https://github.com/jianboy)
+

+ 1 - 0
crawl_font.egg-info/dependency_links.txt

@@ -0,0 +1 @@
+

+ 4 - 0
crawl_font.egg-info/requires.txt

@@ -0,0 +1,4 @@
+requests>=2.32.5
+beautifulsoup4>=4.12.0
+lxml>=5.0.0
+tqdm>=4.66.0

+ 537 - 20
crawl_font/font.py

@@ -4,38 +4,555 @@
 @Contact :   liuyuqi.gov@msn.cn
 @Time    :   2024/07/30 19:02:15
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
-@Desc    :   
+@Desc    :   Font downloader for fonts.net.cn and chinaz.com
 '''
+import os
+import re
+import time
+import zipfile
 import requests
+from bs4 import BeautifulSoup
+from urllib.parse import urljoin, urlparse
+from tqdm import tqdm
+from typing import List, Dict, Optional, Tuple
+
 
 class Font(object):
-    """docstring for Font"""
+    """Font downloader class"""
+    
     header = {
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
     }
-    def __init__(self):
+    
+    FONTSNET_BASE = 'https://www.fonts.net.cn'
+    CHINAZ_BASE = 'https://font.chinaz.com'
+    
+    def __init__(self, download_dir: str = './fonts'):
         self.sess = requests.Session()
         self.sess.headers.update(self.header)
+        self.download_dir = download_dir
+        self._ensure_download_dir()
         
+    def _ensure_download_dir(self):
+        if not os.path.exists(self.download_dir):
+            os.makedirs(self.download_dir)
+    
     def run(self):
-        pass
-
+        print("=" * 50)
+        print("字体下载工具")
+        print("=" * 50)
+        print("\n请选择要爬取的网站:")
+        print("1. 字体天下 (fonts.net.cn) - 商用免费字体")
+        print("2. 站长字体 (font.chinaz.com)")
+        print("3. 下载指定字体ID")
+        print("4. 退出")
+        
+        choice = input("\n请输入选项 (1-4): ").strip()
+        
+        if choice == '1':
+            self.crawl_fontsnet()
+        elif choice == '2':
+            self.crawl_chinaz()
+        elif choice == '3':
+            font_id = input("请输入字体ID (例如: 37476120124): ").strip()
+            if font_id:
+                self.download_font_by_id(font_id)
+        elif choice == '4':
+            print("退出程序")
+            return
+        else:
+            print("无效选项,退出程序")
+    
     def crawl_fontsnet(self):
-        # download 532 商用字体
-        self.sess.headers.update(
-            {
-                "Origin": "https://www.fonts.net.cn/"
-            }
-        )
+        print("\n" + "=" * 50)
+        print("开始爬取字体天下 (fonts.net.cn)")
+        print("=" * 50)
+        
+        print("\n请选择爬取方式:")
+        print("1. 按分类爬取 (中文字体/英文字体/图形字体)")
+        print("2. 爬取商用免费字体")
+        print("3. 爬取最新字体")
+        print("4. 返回主菜单")
+        
+        choice = input("\n请输入选项 (1-4): ").strip()
+        
+        if choice == '1':
+            self._crawl_by_category()
+        elif choice == '2':
+            self._crawl_free_commercial()
+        elif choice == '3':
+            self._crawl_latest()
+        elif choice == '4':
+            return
+        else:
+            print("无效选项")
+    
+    def _crawl_by_category(self):
+        print("\n分类选项:")
+        print("1. 中文字体")
+        print("2. 英文字体")
+        print("3. 图形字体")
+        print("4. 返回")
+        
+        choice = input("\n请选择分类 (1-4): ").strip()
+        
+        category_urls = {
+            '1': '/font-zh.html',
+            '2': '/font-en.html',
+            '3': '/font-other.html',
+        }
+        
+        if choice in category_urls:
+            url = self.FONTSNET_BASE + category_urls[choice]
+            fonts = self._get_fonts_from_list_page(url)
+            self._process_font_list(fonts)
+        elif choice == '4':
+            return
+        else:
+            print("无效选项")
+    
+    def _crawl_free_commercial(self):
+        print("\n爬取商用免费字体...")
+        url = self.FONTSNET_BASE + '/font-zh.html'
+        fonts = self._get_fonts_from_list_page(url, filter_free=True)
+        self._process_font_list(fonts)
+    
+    def _crawl_latest(self):
+        print("\n爬取最新字体...")
+        url = self.FONTSNET_BASE + '/font-zh.html'
+        fonts = self._get_fonts_from_list_page(url)
+        self._process_font_list(fonts[:20])
+    
+    def _get_fonts_from_list_page(self, url: str, filter_free: bool = False) -> List[Dict]:
+        fonts = []
+        try:
+            print(f"正在访问: {url}")
+            resp = self.sess.get(url, timeout=30)
+            resp.encoding = 'utf-8'
+            soup = BeautifulSoup(resp.text, 'lxml')
+            
+            font_links = soup.find_all('a', href=re.compile(r'/font-\d+\.html'))
+            
+            seen_ids = set()
+            for link in font_links:
+                href = link.get('href', '')
+                match = re.search(r'/font-(\d+)\.html', href)
+                if match:
+                    font_id = match.group(1)
+                    if font_id not in seen_ids:
+                        seen_ids.add(font_id)
+                        font_name = link.get_text(strip=True)
+                        if not font_name:
+                            font_name = f'font_{font_id}'
+                        
+                        fonts.append({
+                            'id': font_id,
+                            'name': font_name,
+                            'url': urljoin(self.FONTSNET_BASE, href)
+                        })
+            
+            print(f"找到 {len(fonts)} 个字体")
+            
+        except Exception as e:
+            print(f"获取字体列表失败: {e}")
+        
+        return fonts
+    
+    def _process_font_list(self, fonts: List[Dict]):
+        if not fonts:
+            print("没有找到可下载的字体")
+            return
+        
+        print(f"\n找到 {len(fonts)} 个字体:")
+        for i, font in enumerate(fonts[:10], 1):
+            print(f"  {i}. {font['name']} (ID: {font['id']})")
+        
+        if len(fonts) > 10:
+            print(f"  ... 还有 {len(fonts) - 10} 个字体")
+        
+        print("\n操作选项:")
+        print("1. 下载所有字体")
+        print("2. 下载指定范围 (例如: 1-5)")
+        print("3. 输入字体ID下载")
+        print("4. 返回")
+        
+        choice = input("\n请选择操作 (1-4): ").strip()
+        
+        if choice == '1':
+            for font in tqdm(fonts, desc="下载字体"):
+                self.download_font(font)
+        elif choice == '2':
+            range_str = input("请输入范围 (例如: 1-5): ").strip()
+            try:
+                start, end = map(int, range_str.split('-'))
+                for font in fonts[start-1:end]:
+                    self.download_font(font)
+            except Exception as e:
+                print(f"输入格式错误: {e}")
+        elif choice == '3':
+            font_id = input("请输入字体ID: ").strip()
+            if font_id:
+                self.download_font_by_id(font_id)
+        elif choice == '4':
+            return
+        else:
+            print("无效选项")
+    
+    def download_font_by_id(self, font_id: str):
+        font = {
+            'id': font_id,
+            'name': f'font_{font_id}',
+            'url': f'{self.FONTSNET_BASE}/font-{font_id}.html'
+        }
+        self.download_font(font)
+    
+    def download_font(self, font: Dict) -> bool:
+        print(f"\n正在处理字体: {font['name']} (ID: {font['id']})")
+        
+        detail_url = font.get('url', '')
+        if not detail_url:
+            detail_url = f'{self.FONTSNET_BASE}/font-{font["id"]}.html'
+        
+        try:
+            download_urls = self._parse_detail_page(detail_url)
+            
+            if not download_urls:
+                print(f"  未找到下载链接: {font['name']}")
+                return False
+            
+            success = False
+            for url_info in download_urls:
+                download_url = url_info.get('url', '')
+                download_type = url_info.get('type', 'unknown')
+                
+                print(f"  尝试下载 ({download_type}): {download_url[:50]}...")
+                
+                save_path = self._download_file(download_url, font['name'])
+                if save_path:
+                    print(f"  下载成功: {save_path}")
+                    success = True
+                    break
+            
+            return success
+            
+        except Exception as e:
+            print(f"  下载失败: {font['name']}, 错误: {e}")
+            return False
+    
+    def _parse_detail_page(self, url: str) -> List[Dict]:
+        download_urls = []
+        
+        try:
+            resp = self.sess.get(url, timeout=30)
+            resp.encoding = 'utf-8'
+            soup = BeautifulSoup(resp.text, 'lxml')
+            
+            download_links = soup.find_all('a', string=re.compile(r'下载|download|Download'))
+            
+            for link in download_links:
+                href = link.get('href', '')
+                if href and not href.startswith('#') and not href.startswith('javascript'):
+                    full_url = urljoin(self.FONTSNET_BASE, href)
+                    text = link.get_text(strip=True)
+                    
+                    if '免费' in text or 'free' in text.lower():
+                        download_type = 'free'
+                    elif '官网' in text or 'official' in text.lower():
+                        download_type = 'official'
+                    else:
+                        download_type = 'direct'
+                    
+                    download_urls.append({
+                        'url': full_url,
+                        'type': download_type,
+                        'text': text
+                    })
+            
+            all_links = soup.find_all('a', href=True)
+            for link in all_links:
+                href = link.get('href', '')
+                if re.search(r'\.(zip|rar|7z|ttf|otf|woff)', href, re.I):
+                    full_url = urljoin(self.FONTSNET_BASE, href)
+                    if not any(u['url'] == full_url for u in download_urls):
+                        download_urls.append({
+                            'url': full_url,
+                            'type': 'direct_file',
+                            'text': link.get_text(strip=True)
+                        })
+            
+            scripts = soup.find_all('script')
+            for script in scripts:
+                script_text = script.get_text() if script else ''
+                if script_text:
+                    url_patterns = [
+                        r'["\'](https?://[^"\']+\.(?:zip|rar|7z|ttf|otf|woff))["\']',
+                        r'["\'](/download/[^"\']+)["\']',
+                    ]
+                    for pattern in url_patterns:
+                        matches = re.findall(pattern, script_text)
+                        for match in matches:
+                            full_url = urljoin(self.FONTSNET_BASE, match)
+                            if not any(u['url'] == full_url for u in download_urls):
+                                download_urls.append({
+                                    'url': full_url,
+                                    'type': 'script_extracted',
+                                    'text': '从脚本提取'
+                                })
+            
+            print(f"  解析到 {len(download_urls)} 个下载链接")
+            
+        except Exception as e:
+            print(f"  解析详情页失败: {e}")
+        
+        return download_urls
+    
+    def _download_file(self, url: str, font_name: str) -> Optional[str]:
+        try:
+            headers = self.header.copy()
+            headers['Referer'] = self.FONTSNET_BASE
+            
+            resp = self.sess.get(url, headers=headers, stream=True, timeout=60, allow_redirects=True)
+            
+            if resp.status_code != 200:
+                print(f"    HTTP状态码: {resp.status_code}")
+                return None
+            
+            content_type = resp.headers.get('Content-Type', '')
+            content_disposition = resp.headers.get('Content-Disposition', '')
+            
+            filename = self._extract_filename(content_disposition, url, font_name)
+            
+            safe_filename = self._sanitize_filename(filename)
+            save_path = os.path.join(self.download_dir, safe_filename)
+            
+            total_size = int(resp.headers.get('Content-Length', 0))
+            
+            print(f"    保存到: {save_path}")
+            if total_size > 0:
+                print(f"    文件大小: {total_size / 1024:.1f} KB")
+            
+            with open(save_path, 'wb') as f:
+                if total_size > 0:
+                    with tqdm(total=total_size, unit='B', unit_scale=True, desc='    下载') as pbar:
+                        for chunk in resp.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                                pbar.update(len(chunk))
+                else:
+                    for chunk in resp.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+            
+            if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
+                file_size = os.path.getsize(save_path)
+                print(f"    下载完成,大小: {file_size / 1024:.1f} KB")
+                
+                if save_path.lower().endswith('.zip'):
+                    self._extract_zip(save_path)
+                
+                return save_path
+            else:
+                if os.path.exists(save_path):
+                    os.remove(save_path)
+                return None
+                
+        except requests.exceptions.Timeout:
+            print(f"    下载超时")
+            return None
+        except requests.exceptions.ConnectionError as e:
+            print(f"    连接错误: {e}")
+            return None
+        except Exception as e:
+            print(f"    下载错误: {e}")
+            return None
+    
+    def _extract_filename(self, content_disposition: str, url: str, default_name: str) -> str:
+        if content_disposition:
+            match = re.search(r'filename[^;=\n]*=((["\']).*?\2|[^;\n]*)', content_disposition)
+            if match:
+                filename = match.group(1).strip('"\'')
+                if filename:
+                    return filename
+        
+        parsed_url = urlparse(url)
+        path = parsed_url.path
+        if path and path != '/':
+            filename = os.path.basename(path)
+            if filename and '.' in filename:
+                return filename
+        
+        return f"{default_name}.zip"
+    
+    def _sanitize_filename(self, filename: str) -> str:
+        invalid_chars = '<>:"/\\|?*'
+        for char in invalid_chars:
+            filename = filename.replace(char, '_')
+        
+        if len(filename) > 200:
+            name, ext = os.path.splitext(filename)
+            filename = name[:190] + ext
+        
+        return filename
+    
+    def _extract_zip(self, zip_path: str):
+        try:
+            extract_dir = os.path.splitext(zip_path)[0]
+            if not os.path.exists(extract_dir):
+                os.makedirs(extract_dir)
+            
+            with zipfile.ZipFile(zip_path, 'r') as zf:
+                print(f"    解压到: {extract_dir}")
+                zf.extractall(extract_dir)
+                
+                font_files = []
+                for root, dirs, files in os.walk(extract_dir):
+                    for file in files:
+                        if file.lower().endswith(('.ttf', '.otf', '.woff', '.woff2')):
+                            font_files.append(os.path.join(root, file))
+                
+                if font_files:
+                    print(f"    找到 {len(font_files)} 个字体文件:")
+                    for ff in font_files[:5]:
+                        print(f"      - {os.path.basename(ff)}")
+                    if len(font_files) > 5:
+                        print(f"      ... 还有 {len(font_files) - 5} 个")
+            
+        except zipfile.BadZipFile:
+            print(f"    警告: 不是有效的 ZIP 文件")
+        except Exception as e:
+            print(f"    解压失败: {e}")
+    
+    def crawl_chinaz(self):
+        print("\n" + "=" * 50)
+        print("开始爬取站长字体 (font.chinaz.com)")
+        print("=" * 50)
+        
+        print("\n功能开发中...")
+        print("站长字体网站结构:")
+        print("  - 首页: https://font.chinaz.com/")
+        print("  - 分类页面: https://font.chinaz.com/zhongwenziti.html")
+        print("  - 详情页: https://font.chinaz.com/{font_id}.html")
+        
+        print("\n请输入要下载的字体详情页URL,或输入 'back' 返回主菜单:")
+        url = input("URL: ").strip()
+        
+        if url.lower() == 'back':
+            return
+        
+        if url.startswith('http'):
+            self._download_chinaz_font(url)
+        else:
+            print("无效的URL")
+    
+    def _download_chinaz_font(self, url: str) -> bool:
+        print(f"\n正在处理: {url}")
+        
+        try:
+            resp = self.sess.get(url, timeout=30)
+            resp.encoding = 'utf-8'
+            soup = BeautifulSoup(resp.text, 'lxml')
+            
+            font_name = 'unknown_chinaz_font'
+            title_tag = soup.find('title')
+            if title_tag:
+                title_text = title_tag.get_text()
+                match = re.search(r'([^|_]+)', title_text)
+                if match:
+                    font_name = match.group(1).strip()
+            
+            download_urls = []
+            
+            download_links = soup.find_all('a', href=True)
+            for link in download_links:
+                href = link.get('href', '')
+                text = link.get_text(strip=True)
+                
+                if re.search(r'下载|download|本地|高速', text, re.I):
+                    if href and not href.startswith('#') and not href.startswith('javascript'):
+                        full_url = urljoin(self.CHINAZ_BASE, href)
+                        download_urls.append({
+                            'url': full_url,
+                            'type': 'chinaz_download',
+                            'text': text
+                        })
+            
+            for link in download_links:
+                href = link.get('href', '')
+                if re.search(r'\.(zip|rar|7z|ttf|otf)', href, re.I):
+                    full_url = urljoin(self.CHINAZ_BASE, href)
+                    if not any(u['url'] == full_url for u in download_urls):
+                        download_urls.append({
+                            'url': full_url,
+                            'type': 'direct_file',
+                            'text': link.get_text(strip=True)
+                        })
+            
+            print(f"  解析到 {len(download_urls)} 个下载链接")
+            
+            if download_urls:
+                font = {
+                    'id': 'chinaz_' + str(int(time.time())),
+                    'name': font_name,
+                    'url': url
+                }
+                
+                for url_info in download_urls:
+                    print(f"  尝试下载: {url_info['url'][:60]}...")
+                    save_path = self._download_file(url_info['url'], font_name)
+                    if save_path:
+                        print(f"  下载成功: {save_path}")
+                        return True
+            
+            print("  未找到可下载的链接")
+            return False
+            
+        except Exception as e:
+            print(f"  处理失败: {e}")
+            return False
+    
+    def download(self, url: str, save_path: str = None) -> Optional[str]:
+        print(f"\n下载: {url}")
+        
+        try:
+            resp = self.sess.get(url, stream=True, timeout=60)
+            
+            if resp.status_code != 200:
+                print(f"  HTTP状态码: {resp.status_code}")
+                return None
+            
+            if not save_path:
+                content_disposition = resp.headers.get('Content-Disposition', '')
+                save_path = self._extract_filename(content_disposition, url, 'downloaded_font')
+                save_path = os.path.join(self.download_dir, self._sanitize_filename(save_path))
+            
+            total_size = int(resp.headers.get('Content-Length', 0))
+            
+            with open(save_path, 'wb') as f:
+                if total_size > 0:
+                    with tqdm(total=total_size, unit='B', unit_scale=True, desc='  下载') as pbar:
+                        for chunk in resp.iter_content(chunk_size=8192):
+                            if chunk:
+                                f.write(chunk)
+                                pbar.update(len(chunk))
+                else:
+                    for chunk in resp.iter_content(chunk_size=8192):
+                        if chunk:
+                            f.write(chunk)
+            
+            print(f"  保存到: {save_path}")
+            return save_path
+            
+        except Exception as e:
+            print(f"  下载失败: {e}")
+            return None
 
 
-        pass
def main():
    """Entry point: build a Font downloader with defaults and run its menu."""
    Font().run()


if __name__ == '__main__':
    main()

+ 4 - 1
pyproject.toml

@@ -1,9 +1,12 @@
 [project]
 name = "crawl-font"
 version = "0.1.0"
-description = "Add your description here"
+description = "Font downloader for fonts.net.cn and other font websites"
 readme = "README.md"
 requires-python = ">=3.12"
 dependencies = [
     "requests>=2.32.5",
+    "beautifulsoup4>=4.12.0",
+    "lxml>=5.0.0",
+    "tqdm>=4.66.0",
 ]