11 months ago · d839912a33
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,3 @@
 
				+download/
			
 
				+__pycache__/
			
 
				+*.pyc
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,24 @@
 
				+# crawl_yunque
			
 
				+语雀爬虫 可以保存整个语雀知识库为Markdown格式 (包含完整目录结构和索引) 
			
 
				+
			
 
				+## Usage
			
 
				+
			
 
				+复制文档url，执行如下命令：
			
 
				+```
			
 
				+python main.py -url https://www.yuque.com/burpheart/phpaudit
			
 
				+```
			
 
				+
			
 
				+
			
 
				+## 源码分析
			
 
				+
			
 
				+运行 main.py，获取url参数调用requests获取源码，查找如下网页源码：
			
 
				+```
			
 
				+<script nonce=wJM6HFxGFWlvqbg5UT1h>
			
 
				+(function() {
			
 
				+  window.appData = JSON.parse(decodeURIComponent("%7B%22me%22%3A%7B%xxxx7D"));
			
 
				+})();
			
 
				+</script>
			
 
				+```
			
 
				+
			
 
				+可以发现，语雀将知识库数据存储在 window.appData 中，我们只需要对其进行 URL 解码并转换为 json 格式，即可获取到知识库的目录信息，再通过 /api/docs/ 接口逐篇下载所有的文章内容。
			
 
				+
			
--- a/gui.py
+++ b/gui.py
@@ -0,0 +1,11 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/05/20 10:22:22
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   gui
			
 
				+'''
			
 
				+
			
 
				+if __name__=='__main__':
			
 
				+    pass
			
--- a/main.py
+++ b/main.py
@@ -0,0 +1,13 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/05/20 09:33:53
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   enter point
			
 
				+'''
			
 
				+
			
 
				+from yuque import main
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    main()
			
--- a/main.ui
+++ b/main.ui
@@ -0,0 +1,100 @@
 
				+<?xml version="1.0" encoding="UTF-8"?>
			
 
				+<ui version="4.0">
			
 
				+ <class>MainWindow</class>
			
 
				+ <widget class="QMainWindow" name="MainWindow">
			
 
				+  <property name="geometry">
			
 
				+   <rect>
			
 
				+    <x>0</x>
			
 
				+    <y>0</y>
			
 
				+    <width>563</width>
			
 
				+    <height>319</height>
			
 
				+   </rect>
			
 
				+  </property>
			
 
				+  <property name="windowTitle">
			
 
				+   <string>语雀知识库批量下载工具</string>
			
 
				+  </property>
			
 
				+  <widget class="QWidget" name="centralwidget">
			
 
				+   <widget class="QLineEdit" name="lineEdit">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>190</x>
			
 
				+      <y>100</y>
			
 
				+      <width>321</width>
			
 
				+      <height>20</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+   <widget class="QPushButton" name="pushButton">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>190</x>
			
 
				+      <y>190</y>
			
 
				+      <width>75</width>
			
 
				+      <height>23</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+    <property name="text">
			
 
				+     <string>开始下载</string>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+   <widget class="QLabel" name="label">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>140</x>
			
 
				+      <y>100</y>
			
 
				+      <width>54</width>
			
 
				+      <height>12</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+    <property name="text">
			
 
				+     <string>链接</string>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+   <widget class="QLabel" name="label_2">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>140</x>
			
 
				+      <y>20</y>
			
 
				+      <width>251</width>
			
 
				+      <height>51</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+    <property name="font">
			
 
				+     <font>
			
 
				+      <family>Algerian</family>
			
 
				+      <pointsize>16</pointsize>
			
 
				+     </font>
			
 
				+    </property>
			
 
				+    <property name="text">
			
 
				+     <string>语雀知识库批量下载工具</string>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+   <widget class="QLabel" name="label_3">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>120</x>
			
 
				+      <y>140</y>
			
 
				+      <width>54</width>
			
 
				+      <height>12</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+    <property name="text">
			
 
				+     <string>保存路径：</string>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+   <widget class="QLineEdit" name="lineEdit_2">
			
 
				+    <property name="geometry">
			
 
				+     <rect>
			
 
				+      <x>190</x>
			
 
				+      <y>140</y>
			
 
				+      <width>321</width>
			
 
				+      <height>20</height>
			
 
				+     </rect>
			
 
				+    </property>
			
 
				+   </widget>
			
 
				+  </widget>
			
 
				+  <widget class="QStatusBar" name="statusbar"/>
			
 
				+ </widget>
			
 
				+ <resources/>
			
 
				+ <connections/>
			
 
				+</ui>
			
--- a/yuque/__init__.py
+++ b/yuque/__init__.py
@@ -0,0 +1,14 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/05/20 09:34:58
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+'''
			
 
				+
			
 
				+from yuque.yunque import YunQue
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    YunQue().run()
			
--- a/yuque/api.py
+++ b/yuque/api.py
@@ -0,0 +1,3 @@
 
				+
			
 
				+host="https://www.yuque.com"    
			
 
				+docs=host + "/api/docs/"
			
--- a/yuque/yunque.py
+++ b/yuque/yunque.py
@@ -0,0 +1,115 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/05/20 09:34:22
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   
			
 
				+'''
			
 
				+import sys
			
 
				+
			
 
				+import requests
			
 
				+import json
			
 
				+import re
			
 
				+import os
			
 
				+import urllib.parse
			
 
				+import logging
			
 
				+import argparse
			
 
				+from yuque import api
			
 
				+
			
 
				+class YunQue(object):
			
 
				+    ''' 语雀知识库下载 '''
			
 
				+    
			
 
				+    def __init__(self):
			
 
				+        self.sess=requests.Session()
			
 
				+        self.logger = logging.getLogger(__name__)
			
 
				+        self.logger.setLevel(logging.DEBUG)
			
 
				+        self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
			
 
				+        self.ch = logging.StreamHandler()
			
 
				+        self.ch.setLevel(logging.DEBUG)
			
 
				+        self.ch.setFormatter(self.formatter)
			
 
				+        self.logger.addHandler(self.ch)
			
 
				+        self.args = None
			
 
				+        self.parser = argparse.ArgumentParser(description='yuque download')
			
 
				+        self.parser.add_argument('-url', '--url', help='url', default='')
			
 
				+        self.args = self.parser.parse_args()
			
 
				+
			
 
				+    def save_page(self, book_id, sulg, path):
			
 
				+        ''' 保存文档 '''
			
 
				+        docsdata = requests.get(api.docs + sulg + '?book_id=' + book_id + '&merge_dynamic_data=false&mode=markdown')
			
 
				+        if (docsdata.status_code != 200):
			
 
				+            print("文档下载失败 页面可能被删除 ", book_id, sulg, docsdata.content)
			
 
				+            return
			
 
				+        docsjson = json.loads(docsdata.content)
			
 
				+
			
 
				+        with open(path, 'w', encoding='utf-8') as f:
			
 
				+            f.write(docsjson['data']['sourcecode'])
			
 
				+
			
 
				+    def get_book(self, url):
			
 
				+        ''' 获取知识库 '''
			
 
				+        try:
			
 
				+            docsdata = requests.get(url)
			
 
				+            data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
			
 
				+            docsjson = json.loads(urllib.parse.unquote(data[0]))
			
 
				+        except Exception as e:
			
 
				+            self.logger.error("检查文档链接是否错误，"+e)
			
 
				+        test = []
			
 
				+        list = {}
			
 
				+        temp = {}
			
 
				+        md = ""
			
 
				+        table = str.maketrans('\/:*?"<>|' + "\n\r", "___________")
			
 
				+        prename = ""
			
 
				+        if (os.path.exists("download/" + str(docsjson['book']['id'])) == False):
			
 
				+            os.makedirs("download/" + str(docsjson['book']['id']))
			
 
				+        # 遍历文档
			
 
				+        for doc in docsjson['book']['toc']:
			
 
				+            # 创建目录
			
 
				+            if (doc['type'] == 'TITLE'):
			
 
				+                filename = ''
			
 
				+                list[doc['uuid']] = {'0': doc['title'], '1': doc['parent_uuid']}
			
 
				+                uuid = doc['uuid']
			
 
				+                temp[doc['uuid']] = ''
			
 
				+                while True:
			
 
				+                    if (list[uuid]['1'] != ''):
			
 
				+                        if temp[doc['uuid']] == '':
			
 
				+                            temp[doc['uuid']] = doc['title'].translate(table)
			
 
				+                        else:
			
 
				+                            temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
			
 
				+                        uuid = list[uuid]['1']
			
 
				+                    else:
			
 
				+                        temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
			
 
				+                        break
			
 
				+                if ((os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])) == False):
			
 
				+                    os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
			
 
				+                if (temp[doc['uuid']].endswith("/")):
			
 
				+                    md += "## " + temp[doc['uuid']][:-1] + "\n"
			
 
				+                else:
			
 
				+                    md += "  " * (temp[doc['uuid']].count("/") - 1) + "* " + temp[doc['uuid']][
			
 
				+                                                                            temp[doc['uuid']].rfind("/") + 1:] + "\n"
			
 
				+            if (doc['url'] != ''):
			
 
				+                if doc['parent_uuid'] != "":
			
 
				+                    if (temp[doc['parent_uuid']].endswith("/")):
			
 
				+                        md += " " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
			
 
				+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
			
 
				+                    else:
			
 
				+                        md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
			
 
				+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
			
 
				+                    self.save_page(str(docsjson['book']['id']), doc['url'],
			
 
				+                            "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc[
			
 
				+                                'title'].translate(table) + '.md')
			
 
				+                else:
			
 
				+                    md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
			
 
				+                        doc['title'].translate(table) + '.md') + ")" + "\n"
			
 
				+                    self.save_page(str(docsjson['book']['id']), doc['url'],
			
 
				+                            "download/" + str(docsjson['book']['id']) + "/" + doc[
			
 
				+                                'title'].translate(table) + '.md')
			
 
				+        with open("download/" + str(docsjson['book']['id']) + '/' + "/SUMMARY.md", 'w', encoding='utf-8') as f:
			
 
				+            f.write(md)
			
 
				+
			
 
				+    def run(self):
			
 
				+        ''' 获取文档 '''
			
 
				+        if(self.args.url != ''):
			
 
				+            url = self.args.url
			
 
				+            self.get_book(url)
			
 
				+        else:
			
 
				+            url = input("请输入语雀文档链接：")