@@ -6,17 +6,112 @@
 @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc : None
 '''
-import os,sys,re,requests
+import sys
-class CrawlYuque(object):
-    """ Crawl Yuque API """
+import requests
+import json
+import re
+import os
+import urllib.parse
+import logging
+import argparse
+from yuque import api
+
+class YuQue(object):
+    ''' Download a Yuque knowledge base '''
+
     def __init__(self):
-        """ init """
         self.sess=requests.Session()
-
+
+        self.logger = logging.getLogger(__name__)
+        self.logger.setLevel(logging.DEBUG)
+        self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        self.ch = logging.StreamHandler()
+        self.ch.setLevel(logging.DEBUG)
+        self.ch.setFormatter(self.formatter)
+        self.logger.addHandler(self.ch)
+
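+        # Command-line arguments: the knowledge base URL can be passed with -url/--url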
+        self.args = None
+        self.parser = argparse.ArgumentParser(description='yuque download')
+        self.parser.add_argument('-url', '--url', help='Yuque knowledge base URL', default='')
+        self.args = self.parser.parse_args()
+
+    def save_page(self, book_id, slug, path):
+        ''' Save a single document as a Markdown file '''
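+        # Request the document from the Yuque docs API in markdown mode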
+        docsdata = requests.get(api.docs + slug + '?book_id=' + book_id + '&merge_dynamic_data=false&mode=markdown')
+        if docsdata.status_code != 200:
+            print("Document download failed, the page may have been deleted:", book_id, slug, docsdata.content)
+            return
+        docsjson = json.loads(docsdata.content)
+
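+        # The API response carries the document's Markdown source under data.sourcecode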
+        with open(path, 'w', encoding='utf-8') as f:
+            f.write(docsjson['data']['sourcecode'])
+
+    def get_book(self, url):
+        ''' Fetch a knowledge base and download every document in it '''
+        try:
+            docsdata = requests.get(url)
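+            # The page embeds the book data as URL-encoded JSON inside a decodeURIComponent("...") call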
+            data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
+            docsjson = json.loads(urllib.parse.unquote(data[0]))
+        except Exception as e:
+            self.logger.error("Check whether the document URL is correct: %s", e)
+            return
+        test = []
+        list = {}
+        temp = {}
+        md = ""
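+        # Translation table: map characters that are invalid in file names (and newlines) to underscores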
+        table = str.maketrans('\\/:*?"<>|' + "\n\r", "___________")
+        prename = ""
+        if not os.path.exists("download/" + str(docsjson['book']['id'])):
+            os.makedirs("download/" + str(docsjson['book']['id']))
+        # Walk every node in the book's table of contents
+        for doc in docsjson['book']['toc']:
+            # Create a directory for each TITLE (group) node
+            if doc['type'] == 'TITLE':
+                filename = ''
+                list[doc['uuid']] = {'0': doc['title'], '1': doc['parent_uuid']}
+                uuid = doc['uuid']
+                temp[doc['uuid']] = ''
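+                # Climb the parent_uuid chain to build the node's directory path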
+                while True:
+                    if list[uuid]['1'] != '':
+                        if temp[doc['uuid']] == '':
+                            temp[doc['uuid']] = doc['title'].translate(table)
+                        else:
+                            temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
+                        uuid = list[uuid]['1']
+                    else:
+                        temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
+                        break
+                if not os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']]):
+                    os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
+                if temp[doc['uuid']].endswith("/"):
+                    md += "## " + temp[doc['uuid']][:-1] + "\n"
+                else:
+                    md += "  " * (temp[doc['uuid']].count("/") - 1) + "* " + temp[doc['uuid']][temp[doc['uuid']].rfind("/") + 1:] + "\n"
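+            # DOC nodes have a URL slug: download the page and add a link to the index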
+            if doc['url'] != '':
+                if doc['parent_uuid'] != "":
+                    if temp[doc['parent_uuid']].endswith("/"):
+                        md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
+                    else:
+                        md += "  " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                            temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
+                    self.save_page(str(docsjson['book']['id']), doc['url'],
+                                   "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md')
+                else:
+                    md += "  " + "* [" + doc['title'] + "](" + urllib.parse.quote(
+                        doc['title'].translate(table) + '.md') + ")" + "\n"
+                    self.save_page(str(docsjson['book']['id']), doc['url'],
+                                   "download/" + str(docsjson['book']['id']) + "/" + doc['title'].translate(table) + '.md')
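+        # Write the accumulated index as SUMMARY.md in the book's download folder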
+        with open("download/" + str(docsjson['book']['id']) + "/SUMMARY.md", 'w', encoding='utf-8') as f:
+            f.write(md)
+
     def run(self):
-        pass
-
-    def get_token(self):
-        pass
+        ''' Fetch the documents from the given knowledge base URL '''
+        if self.args.url != '':
+            url = self.args.url
+            self.get_book(url)
+        else:
+            url = input("Please enter the Yuque document URL: ")