yunque.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. '''
  4. @Contact : liuyuqi.gov@msn.cn
  5. @Time : 2023/05/20 09:34:22
  6. @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
  7. @Desc :
  8. '''
  9. import sys
  10. import requests
  11. import json
  12. import re
  13. import os
  14. import urllib.parse
  15. import logging
  16. import argparse
  17. from yuque import api
  18. class YunQue(object):
  19. ''' 语雀知识库下载 '''
  def __init__(self):
      ''' Create the HTTP session, the console logger and parse the command line.

      Attributes set:
          sess: ``requests.Session`` instance.
          logger: logger named after this module, level DEBUG.
          formatter / ch: formatter and DEBUG-level stream handler for ``logger``.
          parser / args: ``argparse`` parser and the parsed arguments
              (``args.url`` defaults to an empty string).
      '''
      self.sess = requests.Session()

      self.logger = logging.getLogger(__name__)
      self.logger.setLevel(logging.DEBUG)
      self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
      self.ch = logging.StreamHandler()
      self.ch.setLevel(logging.DEBUG)
      self.ch.setFormatter(self.formatter)
      # getLogger(__name__) returns the same logger object for every
      # instance; attaching a handler each time would print every record
      # once per instance created, so only attach when none is present.
      if not self.logger.handlers:
          self.logger.addHandler(self.ch)

      self.parser = argparse.ArgumentParser(description='yuque download')
      self.parser.add_argument('-url', '--url', help='url', default='')
      self.args = self.parser.parse_args()
  def save_page(self, book_id, sulg, path, timeout=30):
      ''' Download one document in markdown mode and write it to ``path``.

      Args:
          book_id: Knowledge-base id, sent as the ``book_id`` query parameter.
          sulg: Document slug, appended to ``api.docs`` to form the request URL.
          path: Destination file; written as UTF-8 text.
          timeout: Seconds to wait for the server before the request is aborted.

      Returns:
          None. A non-200 response or a payload without
          ``data.sourcecode`` is logged and the page is skipped.

      Raises:
          requests.RequestException: if the request itself fails
              (connection error, timeout, ...).
      '''
      # Let requests encode the query string instead of concatenating it by
      # hand, reuse the session created in __init__, and never wait forever.
      docsdata = self.sess.get(
          api.docs + sulg,
          params={
              'book_id': book_id,
              'merge_dynamic_data': 'false',
              'mode': 'markdown',
          },
          timeout=timeout,
      )
      if docsdata.status_code != 200:
          self.logger.error("文档下载失败 页面可能被删除 %s %s %s", book_id, sulg, docsdata.content)
          return

      # Extract the markdown BEFORE opening the file: opening first would
      # leave an empty file behind whenever the payload is malformed.
      try:
          sourcecode = json.loads(docsdata.content)['data']['sourcecode']
      except (ValueError, KeyError, TypeError) as e:
          self.logger.error("文档下载失败 页面可能被删除 %s %s %s", book_id, sulg, e)
          return

      with open(path, 'w', encoding='utf-8') as f:
          f.write(sourcecode)
  def get_book(self, url, timeout=30):
      ''' Download a whole knowledge base into ``download/<book id>/``.

      The page at ``url`` is fetched and the JSON passed to
      ``decodeURIComponent("...")`` inside it is decoded. ``book.toc`` is then
      walked in order: ``TITLE`` entries become directories (nested according
      to ``parent_uuid``), entries with a non-empty ``url`` are stored as
      ``<title>.md`` through ``save_page``. A ``SUMMARY.md`` index linking all
      saved documents is written in the book directory.

      Args:
          url: Address of the knowledge-base page.
          timeout: Seconds to wait for the page before the request is aborted.

      Returns:
          None. If the page cannot be fetched or decoded, the error is logged
          and nothing is downloaded.
      '''
      try:
          page = requests.get(url, timeout=timeout)
          payload = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", page.content.decode('utf-8'))
          docsjson = json.loads(urllib.parse.unquote(payload[0]))
      except Exception as e:
          # Deliberately broad: network failure, no regex match and invalid
          # JSON all mean the link is unusable. Format lazily (str + exception
          # raises TypeError) and stop here, because docsjson is unbound.
          self.logger.error("检查文档链接是否错误,%s", e)
          return

      book_id = str(docsjson['book']['id'])
      book_dir = "download/" + book_id
      os.makedirs(book_dir, exist_ok=True)

      # Characters that are unsafe in file names, each mapped to "_".
      unsafe = '\\/:*?"<>|' + "\n\r"
      table = str.maketrans(unsafe, "_" * len(unsafe))

      titles = {}      # uuid -> {'title': ..., 'parent': ...} for every TITLE seen so far
      title_paths = {} # uuid -> directory path relative to book_dir
      md_lines = []    # lines of SUMMARY.md, joined once at the end

      for doc in docsjson['book']['toc']:
          if doc['type'] == 'TITLE':
              titles[doc['uuid']] = {'title': doc['title'], 'parent': doc['parent_uuid']}
              # Climb the parent chain, prepending one path segment per
              # ancestor. A root title ends up as "Name/" (trailing slash),
              # a nested one as "Root/.../Name".
              # NOTE(review): an ancestor that is not a TITLE entry is not in
              # `titles` and raises KeyError here - confirm against the TOC schema.
              rel_dir = ''
              uuid = doc['uuid']
              while True:
                  node = titles[uuid]
                  if node['parent'] != '':
                      if rel_dir == '':
                          rel_dir = doc['title'].translate(table)
                      else:
                          rel_dir = node['title'].translate(table) + '/' + rel_dir
                      uuid = node['parent']
                  else:
                      rel_dir = node['title'].translate(table) + '/' + rel_dir
                      break
              title_paths[doc['uuid']] = rel_dir
              os.makedirs(book_dir + '/' + rel_dir, exist_ok=True)

              if rel_dir.endswith("/"):
                  # Root title: rendered as a section heading.
                  md_lines.append("## " + rel_dir[:-1] + "\n")
              else:
                  # Nested title: bullet with its own name, indented by depth.
                  md_lines.append(" " * (rel_dir.count("/") - 1) + "* "
                                  + rel_dir[rel_dir.rfind("/") + 1:] + "\n")

          if doc['url'] != '':
              file_name = doc['title'].translate(table) + '.md'
              if doc['parent_uuid'] != "":
                  # NOTE(review): assumes the parent is a TITLE processed
                  # earlier in the TOC; otherwise this lookup raises KeyError.
                  parent_dir = title_paths[doc['parent_uuid']]
                  indent = " " * parent_dir.count("/")
                  # Root directories already end with "/"; strip it so the
                  # link and the file path do not contain "//".
                  rel_file = parent_dir.rstrip("/") + "/" + file_name
              else:
                  indent = " "
                  rel_file = file_name
              md_lines.append(indent + "* [" + doc['title'] + "]("
                              + urllib.parse.quote(rel_file) + ")" + "\n")
              self.save_page(book_id, doc['url'], book_dir + '/' + rel_file)

      with open(book_dir + "/SUMMARY.md", 'w', encoding='utf-8') as f:
          f.write("".join(md_lines))
  102. def run(self):
  103. ''' 获取文档 '''
  104. if(self.args.url != ''):
  105. url = self.args.url
  106. self.get_book(url)
  107. else:
  108. url = input("请输入语雀文档链接:")