liuyuqi-dellpc 11 months ago
commit
d839912a33
8 changed files with 283 additions and 0 deletions
  1. 3 0
      .gitignore
  2. 24 0
      README.md
  3. 11 0
      gui.py
  4. 13 0
      main.py
  5. 100 0
      main.ui
  6. 14 0
      yuque/__init__.py
  7. 3 0
      yuque/api.py
  8. 115 0
      yuque/yunque.py

+ 3 - 0
.gitignore

@@ -0,0 +1,3 @@
+download/
+__pycache__/
+*.pyc

+ 24 - 0
README.md

@@ -0,0 +1,24 @@
+# crawl_yunque
+语雀爬虫 可以保存整个语雀知识库为Markdown格式 (包含完整目录结构和索引) 
+
+## Usage
+
+复制文档url,执行如下命令:
+```
+python main.py -url https://www.yuque.com/burpheart/phpaudit
+```
+
+
+## 源码分析
+
+运行 main.py 后,程序读取 url 参数,通过 requests 请求该页面,并在返回的网页源码中查找如下片段:
+```
+<script nonce=wJM6HFxGFWlvqbg5UT1h>
+(function() {
+  window.appData = JSON.parse(decodeURIComponent("%7B%22me%22%3A%7B%xxxx7D"));
+})();
+</script>
+```
+
+可以发现,语雀将知识库信息(包括知识库 id 和目录 toc)以 URL 编码的 JSON 字符串形式存储在 window.appData 中。我们只需对其进行 URL 解码并解析为 JSON,即可拿到完整目录,再通过 /api/docs/ 接口逐篇下载 Markdown 格式的文章内容。
+

+ 11 - 0
gui.py

@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/20 10:22:22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   gui
+'''
+
+if __name__=='__main__':
+    pass

+ 13 - 0
main.py

@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/20 09:33:53
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+'''
+
+from yuque import main
+
+if __name__ == '__main__':
+    main()

+ 100 - 0
main.ui

@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<ui version="4.0">
+ <class>MainWindow</class>
+ <widget class="QMainWindow" name="MainWindow">
+  <property name="geometry">
+   <rect>
+    <x>0</x>
+    <y>0</y>
+    <width>563</width>
+    <height>319</height>
+   </rect>
+  </property>
+  <property name="windowTitle">
+   <string>云雀知识库批量下载工具</string>
+  </property>
+  <widget class="QWidget" name="centralwidget">
+   <widget class="QLineEdit" name="lineEdit">
+    <property name="geometry">
+     <rect>
+      <x>190</x>
+      <y>100</y>
+      <width>321</width>
+      <height>20</height>
+     </rect>
+    </property>
+   </widget>
+   <widget class="QPushButton" name="pushButton">
+    <property name="geometry">
+     <rect>
+      <x>190</x>
+      <y>190</y>
+      <width>75</width>
+      <height>23</height>
+     </rect>
+    </property>
+    <property name="text">
+     <string>开始下载</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label">
+    <property name="geometry">
+     <rect>
+      <x>140</x>
+      <y>100</y>
+      <width>54</width>
+      <height>12</height>
+     </rect>
+    </property>
+    <property name="text">
+     <string>链接</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_2">
+    <property name="geometry">
+     <rect>
+      <x>140</x>
+      <y>20</y>
+      <width>251</width>
+      <height>51</height>
+     </rect>
+    </property>
+    <property name="font">
+     <font>
+      <family>Algerian</family>
+      <pointsize>16</pointsize>
+     </font>
+    </property>
+    <property name="text">
+     <string>云雀知识库批量下载工具</string>
+    </property>
+   </widget>
+   <widget class="QLabel" name="label_3">
+    <property name="geometry">
+     <rect>
+      <x>120</x>
+      <y>140</y>
+      <width>54</width>
+      <height>12</height>
+     </rect>
+    </property>
+    <property name="text">
+     <string>保存路径:</string>
+    </property>
+   </widget>
+   <widget class="QLineEdit" name="lineEdit_2">
+    <property name="geometry">
+     <rect>
+      <x>190</x>
+      <y>140</y>
+      <width>321</width>
+      <height>20</height>
+     </rect>
+    </property>
+   </widget>
+  </widget>
+  <widget class="QStatusBar" name="statusbar"/>
+ </widget>
+ <resources/>
+ <connections/>
+</ui>

+ 14 - 0
yuque/__init__.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/20 09:34:58
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+
+from yuque.yunque import YunQue
+
+
+def main():
+    YunQue().run()

+ 3 - 0
yuque/api.py

@@ -0,0 +1,3 @@
+
+host="https://www.yuque.com"    
+docs=host + "/api/docs/"

+ 115 - 0
yuque/yunque.py

@@ -0,0 +1,115 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/05/20 09:34:22
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   
+'''
+import sys
+
+import requests
+import json
+import re
+import os
+import urllib.parse
+import logging
+import argparse
+from yuque import api
+
+class YunQue(object):
+    ''' 语雀知识库下载 '''
+    
+    def __init__(self):
+        self.sess=requests.Session()
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.DEBUG)
+        self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        self.ch = logging.StreamHandler()
+        self.ch.setLevel(logging.DEBUG)
+        self.ch.setFormatter(self.formatter)
+        self.logger.addHandler(self.ch)
+        self.args = None
+        self.parser = argparse.ArgumentParser(description='yuque download')
+        self.parser.add_argument('-url', '--url', help='url', default='')
+        self.args = self.parser.parse_args()
+
+    def save_page(self, book_id, sulg, path):
+        ''' 保存文档 '''
+        docsdata = requests.get(api.docs + sulg + '?book_id=' + book_id + '&merge_dynamic_data=false&mode=markdown')
+        if (docsdata.status_code != 200):
+            print("文档下载失败 页面可能被删除 ", book_id, sulg, docsdata.content)
+            return
+        docsjson = json.loads(docsdata.content)
+
+        with open(path, 'w', encoding='utf-8') as f:
+            f.write(docsjson['data']['sourcecode'])
+
+    def get_book(self, url):
+        ''' 获取知识库 '''
+        try:
+            docsdata = requests.get(url)
+            data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
+            docsjson = json.loads(urllib.parse.unquote(data[0]))
+        except Exception as e:
+            self.logger.error("检查文档链接是否错误,"+e)
+        test = []
+        list = {}
+        temp = {}
+        md = ""
+        table = str.maketrans('\/:*?"<>|' + "\n\r", "___________")
+        prename = ""
+        if (os.path.exists("download/" + str(docsjson['book']['id'])) == False):
+            os.makedirs("download/" + str(docsjson['book']['id']))
+        # 遍历文档
+        for doc in docsjson['book']['toc']:
+            # 创建目录
+            if (doc['type'] == 'TITLE'):
+                filename = ''
+                list[doc['uuid']] = {'0': doc['title'], '1': doc['parent_uuid']}
+                uuid = doc['uuid']
+                temp[doc['uuid']] = ''
+                while True:
+                    if (list[uuid]['1'] != ''):
+                        if temp[doc['uuid']] == '':
+                            temp[doc['uuid']] = doc['title'].translate(table)
+                        else:
+                            temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
+                        uuid = list[uuid]['1']
+                    else:
+                        temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
+                        break
+                if ((os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])) == False):
+                    os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
+                if (temp[doc['uuid']].endswith("/")):
+                    md += "## " + temp[doc['uuid']][:-1] + "\n"
+                else:
+                    md += "  " * (temp[doc['uuid']].count("/") - 1) + "* " + temp[doc['uuid']][
+                                                                            temp[doc['uuid']].rfind("/") + 1:] + "\n"
+            if (doc['url'] != ''):
+                if doc['parent_uuid'] != "":
+                    if (temp[doc['parent_uuid']].endswith("/")):
+                        md += " " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
+                    else:
+                        md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
+                    self.save_page(str(docsjson['book']['id']), doc['url'],
+                            "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc[
+                                'title'].translate(table) + '.md')
+                else:
+                    md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                        doc['title'].translate(table) + '.md') + ")" + "\n"
+                    self.save_page(str(docsjson['book']['id']), doc['url'],
+                            "download/" + str(docsjson['book']['id']) + "/" + doc[
+                                'title'].translate(table) + '.md')
+        with open("download/" + str(docsjson['book']['id']) + '/' + "/SUMMARY.md", 'w', encoding='utf-8') as f:
+            f.write(md)
+
+    def run(self):
+        ''' 获取文档 '''
+        if(self.args.url != ''):
+            url = self.args.url
+            self.get_book(url)
+        else:
+            url = input("请输入语雀文档链接:")