123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2023/05/20 09:34:22
- @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
- @Desc :
- '''
- import sys
- import requests
- import json
- import re
- import os
- import urllib.parse
- import logging
- import argparse
- from yuque import api
- class YunQue(object):
- ''' 语雀知识库下载 '''
-
def __init__(self):
    """Set up the HTTP session, the logger and the command-line arguments.

    Creates a ``requests.Session``, configures the module logger to emit
    DEBUG-level records to a stream handler, and parses ``sys.argv`` for
    the optional ``-url`` / ``--url`` argument (default: empty string).
    The parsed namespace is stored in ``self.args``.
    """
    self.sess = requests.Session()

    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(logging.DEBUG)
    self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    self.ch = logging.StreamHandler()
    self.ch.setLevel(logging.DEBUG)
    self.ch.setFormatter(self.formatter)
    # logging.getLogger(__name__) hands back the same logger object on every
    # call, so attaching a handler unconditionally would duplicate each log
    # line once per instance created. Attach only when none is installed yet.
    if not self.logger.handlers:
        self.logger.addHandler(self.ch)

    self.parser = argparse.ArgumentParser(description='yuque download')
    self.parser.add_argument('-url', '--url', help='url', default='')
    self.args = self.parser.parse_args()
def save_page(self, book_id, sulg, path):
    """Download one document as Markdown and write it to ``path``.

    Args:
        book_id: Knowledge-base id as a string; it is concatenated into the
            query string.
        sulg: Document slug appended to ``api.docs`` (presumably the docs
            endpoint base URL; confirm against ``yuque.api``). The parameter
            name is kept as-is for existing callers.
        path: Destination file, written as UTF-8.

    Returns:
        None. When the request fails or the status code is not 200, a
        message is printed and nothing is written.
    """
    request_url = api.docs + sulg + '?book_id=' + book_id + '&merge_dynamic_data=false&mode=markdown'
    try:
        # Without a timeout a stalled connection would block the whole download forever.
        docsdata = requests.get(request_url, timeout=30)
    except requests.RequestException as e:
        # Treat a network failure like a non-200 response: report it and let
        # the caller carry on with the remaining documents.
        print("文档下载失败 ", book_id, sulg, e)
        return
    if docsdata.status_code != 200:
        print("文档下载失败 页面可能被删除 ", book_id, sulg, docsdata.content)
        return
    # Extract the Markdown before opening the file so that a malformed
    # payload does not leave an empty file behind.
    sourcecode = json.loads(docsdata.content)['data']['sourcecode']
    with open(path, 'w', encoding='utf-8') as f:
        f.write(sourcecode)
- def get_book(self, url):
- ''' 获取知识库 '''
- try:
- docsdata = requests.get(url)
- data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
- docsjson = json.loads(urllib.parse.unquote(data[0]))
- except Exception as e:
- self.logger.error("检查文档链接是否错误,"+e)
- test = []
- list = {}
- temp = {}
- md = ""
- table = str.maketrans('\/:*?"<>|' + "\n\r", "___________")
- prename = ""
- if (os.path.exists("download/" + str(docsjson['book']['id'])) == False):
- os.makedirs("download/" + str(docsjson['book']['id']))
- # 遍历文档
- for doc in docsjson['book']['toc']:
- # 创建目录
- if (doc['type'] == 'TITLE'):
- filename = ''
- list[doc['uuid']] = {'0': doc['title'], '1': doc['parent_uuid']}
- uuid = doc['uuid']
- temp[doc['uuid']] = ''
- while True:
- if (list[uuid]['1'] != ''):
- if temp[doc['uuid']] == '':
- temp[doc['uuid']] = doc['title'].translate(table)
- else:
- temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
- uuid = list[uuid]['1']
- else:
- temp[doc['uuid']] = list[uuid]['0'].translate(table) + '/' + temp[doc['uuid']]
- break
- if ((os.path.exists("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])) == False):
- os.makedirs("download/" + str(docsjson['book']['id']) + '/' + temp[doc['uuid']])
- if (temp[doc['uuid']].endswith("/")):
- md += "## " + temp[doc['uuid']][:-1] + "\n"
- else:
- md += " " * (temp[doc['uuid']].count("/") - 1) + "* " + temp[doc['uuid']][
- temp[doc['uuid']].rfind("/") + 1:] + "\n"
- if (doc['url'] != ''):
- if doc['parent_uuid'] != "":
- if (temp[doc['parent_uuid']].endswith("/")):
- md += " " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
- temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
- else:
- md += " " * temp[doc['parent_uuid']].count("/") + "* [" + doc['title'] + "](" + urllib.parse.quote(
- temp[doc['parent_uuid']] + "/" + doc['title'].translate(table) + '.md') + ")" + "\n"
- self.save_page(str(docsjson['book']['id']), doc['url'],
- "download/" + str(docsjson['book']['id']) + '/' + temp[doc['parent_uuid']] + "/" + doc[
- 'title'].translate(table) + '.md')
- else:
- md += " " + "* [" + doc['title'] + "](" + urllib.parse.quote(
- doc['title'].translate(table) + '.md') + ")" + "\n"
- self.save_page(str(docsjson['book']['id']), doc['url'],
- "download/" + str(docsjson['book']['id']) + "/" + doc[
- 'title'].translate(table) + '.md')
- with open("download/" + str(docsjson['book']['id']) + '/' + "/SUMMARY.md", 'w', encoding='utf-8') as f:
- f.write(md)
- def run(self):
- ''' 获取文档 '''
- if(self.args.url != ''):
- url = self.args.url
- self.get_book(url)
- else:
- url = input("请输入语雀文档链接:")
|