#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Source: lyq/crawl_yunque
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2023/05/20 09:34:22
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   
'''
import sys

import requests
import json
import re
import os
import urllib.parse
import logging
import argparse
from yuque import api

class YunQue(object):
    ''' 语雀知识库下载 '''
    
    def __init__(self):
        ''' Create the HTTP session, the console logger and parse the command line.

        Side effects: reads ``sys.argv`` through ``argparse`` (which may exit
        the process on invalid arguments) and configures the module logger.
        '''
        self.sess = requests.Session()

        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        # logging.getLogger(__name__) returns the same logger object for every
        # instance, so a console handler is attached only once; otherwise each
        # additional YunQue() would duplicate every log line.
        if self.logger.handlers:
            self.ch = self.logger.handlers[0]
            self.formatter = self.ch.formatter
        else:
            self.formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            self.ch = logging.StreamHandler()
            self.ch.setLevel(logging.DEBUG)
            self.ch.setFormatter(self.formatter)
            self.logger.addHandler(self.ch)

        self.parser = argparse.ArgumentParser(description='yuque download')
        self.parser.add_argument('-url', '--url', help='url', default='')
        self.args = self.parser.parse_args()

    def save_page(self, book_id, sulg, path):
        ''' Fetch one document in markdown mode and write its source to ``path``.

        book_id -- knowledge-base id, as a string (it is concatenated into the URL).
        sulg    -- document slug appended to ``api.docs``.
        path    -- destination file; it is overwritten, encoded as UTF-8.

        On a non-200 response a message is printed and nothing is written.
        '''
        query = '?book_id=' + book_id + '&merge_dynamic_data=false&mode=markdown'
        response = requests.get(api.docs + sulg + query)

        if response.status_code == 200:
            payload = json.loads(response.content)
            with open(path, 'w', encoding='utf-8') as out:
                out.write(payload['data']['sourcecode'])
        else:
            print("文档下载失败 页面可能被删除 ", book_id, sulg, response.content)

    def get_book(self, url):
        ''' Download every document of a knowledge base and write a SUMMARY.md index.

        The page at ``url`` is fetched, the URL-encoded JSON embedded in its
        ``decodeURIComponent("...")`` call is decoded, and the book's ``toc``
        list is walked in order:

        * ``TITLE`` entries become directories under ``download/<book id>/``
          and headings / bullets in SUMMARY.md;
        * entries with a non-empty ``url`` are saved through ``save_page`` as
          ``<title>.md`` and linked from SUMMARY.md.

        If the page cannot be fetched or parsed, the error is logged and the
        method returns without writing anything.
        '''
        try:
            docsdata = requests.get(url)
            data = re.findall(r"decodeURIComponent\(\"(.+)\"\)\);", docsdata.content.decode('utf-8'))
            docsjson = json.loads(urllib.parse.unquote(data[0]))
            book = docsjson['book']
        except (requests.RequestException, IndexError, KeyError, TypeError, ValueError) as e:
            # Lazy %-formatting: concatenating str + exception raises TypeError.
            self.logger.error("检查文档链接是否错误，%s", e)
            # Nothing usable was parsed, so there is nothing to download.
            return

        book_id = str(book['id'])
        book_dir = "download/" + book_id
        os.makedirs(book_dir, exist_ok=True)

        # Characters that are unsafe in file names are replaced by '_'.
        unsafe_chars = '\\/:*?"<>|\n\r'
        table = str.maketrans(unsafe_chars, "_" * len(unsafe_chars))

        titles = {}     # uuid -> (title, parent_uuid) for every TITLE entry seen so far
        dirs = {}       # uuid -> directory path of that TITLE, relative to book_dir
        summary = []    # SUMMARY.md lines, joined once at the end

        # NOTE(review): a parent_uuid that does not belong to an earlier TITLE
        # entry raises KeyError below, exactly as before -- confirm whether the
        # toc can contain such entries.
        for doc in book['toc']:
            if doc['type'] == 'TITLE':
                titles[doc['uuid']] = (doc['title'], doc['parent_uuid'])
                # Walk up the parent chain, prepending one path segment per
                # level. A root title ends up as "Name/" (trailing slash); a
                # nested one as "Root/.../Name".
                rel = ''
                node = doc['uuid']
                while True:
                    name, parent = titles[node]
                    if parent != '':
                        if rel == '':
                            rel = doc['title'].translate(table)
                        else:
                            rel = name.translate(table) + '/' + rel
                        node = parent
                    else:
                        rel = name.translate(table) + '/' + rel
                        break
                dirs[doc['uuid']] = rel

                os.makedirs(book_dir + '/' + rel, exist_ok=True)
                if rel.endswith("/"):
                    # Root-level title -> second-level heading.
                    summary.append("## " + rel[:-1] + "\n")
                else:
                    # Nested title -> bullet indented by its depth.
                    summary.append("  " * (rel.count("/") - 1) + "* " + rel[rel.rfind("/") + 1:] + "\n")

            if doc['url'] != '':
                safe_title = doc['title'].translate(table)
                if doc['parent_uuid'] != "":
                    parent_rel = dirs[doc['parent_uuid']]
                    # Indent width follows the original layout: one space per
                    # level under a root title, two per level otherwise.
                    unit = " " if parent_rel.endswith("/") else "  "
                    indent = unit * parent_rel.count("/")
                    # Drop the root title's trailing slash so links and paths
                    # do not contain a doubled "//".
                    if parent_rel.endswith("/"):
                        parent_rel = parent_rel[:-1]
                    rel_file = parent_rel + "/" + safe_title + '.md'
                else:
                    indent = " "
                    rel_file = safe_title + '.md'
                summary.append(indent + "* [" + doc['title'] + "](" + urllib.parse.quote(rel_file) + ")" + "\n")
                self.save_page(book_id, doc['url'], book_dir + '/' + rel_file)

        with open(book_dir + "/SUMMARY.md", 'w', encoding='utf-8') as f:
            f.write("".join(summary))

    def run(self):
        ''' 获取文档 '''
        if(self.args.url != ''):
            url = self.args.url
            self.get_book(url)
        else:
            url = input("请输入语雀文档链接：")