fish 2 months ago
parent
commit
4668d91ff9
5 changed files with 119 additions and 50 deletions
  1. 1 0
      .gitignore
  2. 18 0
      README.md
  3. 71 48
      main.py
  4. 3 2
      requirements.txt
  5. 26 0
      start_gooexport.bat

+ 1 - 0
.gitignore

@@ -58,3 +58,4 @@ docs/_build/
 # PyBuilder
 target/
 
+data/*.xml

+ 18 - 0
README.md

@@ -15,3 +15,21 @@ resultFile=xx
  爬虫数据放到 data 目录中。
 
 4. python main.py
+
+```
+virtualenv .venv
+source .venv/bin/activate
+
+pip install pyinstaller
+
+pip install -r requirements.txt
+
+python main.py -i /workspace/gooExport/data
+
+pyinstaller --onefile main.py
+```
+
+
+## License
+
+## References

+ 71 - 48
main.py

@@ -1,54 +1,77 @@
-#coding=utf-8
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
 '''
-unicodecsv,bs4,lxml
-Created on 2017年6月26日
-@vsersion:python3.6
-@author: liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2017年6月26日
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Parse crawled Goo job XML files and export them to a CSV file.
 '''
-from nt import chdir, listdir
 
+import os
 import unicodecsv as csv
 from bs4 import BeautifulSoup
+import pandas as pd
+class GooExport(object):
+    """ docstring for GooExport """
 
-dataPath="D:\\t"
-resultFile="D:\\result.csv"
-
-  
-def do_list_dir():
-    for lists in listdir(dataPath):
-        saveData(dataPath+"\\"+lists)
-
-def saveData(dataFile):
-    #打开dataFile文件
-    with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
-        xml_doc =f.read()   #读取xml文本内容
-    #去除空格和换行
-    xml_doc=xml_doc.replace("\n", "")
-    xml_doc=xml_doc.replace(" ", "")
-    #xml形式读取
-    soup = BeautifulSoup(xml_doc,"xml")
-    
-    #打开resultFile写文件
-    csvfile = open(resultFile, 'ab')# r只读,w可写,a追加 b二进制读写
-    writer = csv.writer(csvfile, dialect='excel', encoding='gb18030',errors="ignore")
-    needData=soup.findChild("box1")
-    for i in range(len(needData.contents)):
-        data=[]
-        flag=True
-        for j in range(len(needData.contents[i].contents)):
-            try:
-                data.append(needData.contents[i].contents[j].contents[0])
-            except Exception as e:
-                print(e)
-                flag=False
-        if flag==True:
-            writer.writerow(data)    
-    csvfile.close()    
-            
-def main():
-    print("start!")
-    chdir(dataPath)
-    do_list_dir()
-    print("finish!")
-
-main()
+    def __init__(self, inputDataPath, outResultFile):
+        self.inputDataPath=inputDataPath
+        self.outResultFile=outResultFile
+        self.job = pd.DataFrame()
+
+    def parseXml(self, dataFile):
+        # print(dataFile,"------------------")
+        with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
+            xml_doc =f.read()   #读取xml文本内容
+        # 去除空格和换行
+        xml_doc=xml_doc.replace("\n", "")
+        xml_doc=xml_doc.replace(" ", "")
+        # xml 形式读取
+        soup = BeautifulSoup(xml_doc, "lxml-xml")
+        
+        needData=soup.findChild("job")
+        for i in range(len(needData.contents)):
+            data=dict()
+            for j in range(len(needData.contents[i].contents)):
+                try:
+                    key=needData.contents[i].contents[j].name
+                    value=needData.contents[i].contents[j].contents[0]
+                    data[key]=value
+                except Exception as e:
+                    print(e)
+            # 把data加入到job(dataframe)中,其中key是列名,value是数据,可能会缺失字段
+            new_df = pd.DataFrame([data])  # Convert the data to a DataFrame
+            self.job = pd.concat([self.job, new_df], ignore_index=True)
+        self.removeDuplication()
+
+    def removeDuplication(self):
+        # print("removeDuplication")
+        self.job=self.job.drop_duplicates()
+    def run(self):
+        print("start!")
+        # no os.chdir here: chdir-ing into inputDataPath breaks the relative joins below
+        for lists in os.listdir(self.inputDataPath):
+            if os.path.isfile(os.path.join(self.inputDataPath, lists)) and lists.endswith(".xml"):
+                self.parseXml(os.path.join(self.inputDataPath, lists))
+        with open(self.outResultFile, 'wb') as f:
+            writer = csv.writer(f, dialect='excel', encoding='gb18030',errors="ignore")
+            writer.writerow(self.job.columns)
+            writer.writerows(self.job.values)
+        print("finish!")
+
+if __name__=='__main__':
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser()
+    parser.add_argument("-i", "--inputDataPath", help="inputDataPath", default=".", type=str)
+    parser.add_argument("-o", "--outResultFile", help="outResultFile", default="/workspace/result.csv", type=str)
+
+    args = parser.parse_args()
+    inputDataPath=args.inputDataPath
+    outResultFile=args.outResultFile
+
+    if inputDataPath=="" or outResultFile=="":
+        print("please input inputDataPath and outResultFile")
+        exit()
+    gooExport = GooExport(inputDataPath, outResultFile)
+    gooExport.run()

+ 3 - 2
requirements.txt

@@ -1,3 +1,4 @@
-beautifulsoup4==4.5.3
+beautifulsoup4==4.12.0
 unicodecsv==0.14.1
-lxml
+lxml
+pandas

+ 26 - 0
start_gooexport.bat

@@ -0,0 +1,26 @@
+@echo off
+REM ***************************************************************************
+REM @Contact :   liuyuqi.gov@msn.cn
+REM @Time    :   2024/12/19 02:55:35
+REM @Version :   1.0
+REM @License :   (C)Copyright 2019 liuyuqi.
+REM @Desc    :   提示用户输入i 和 o ,执行命令 python main.py -i %1 -o %2
+REM @File    :   start_gooexport.bat
+REM @Author  :   liuyuqi
+REM @History :
+REM %1 - inputDataPath (forwarded to main.py -i)
+REM %2 - outResultFile (forwarded to main.py -o)
+REM %3 - (unused)
+REM ***************************************************************************
+
+set /p inputPath=Enter input data path (-i): 
+set /p outResult=Enter output result file (-o): 
+
+python main.py -i %inputPath% -o %outResult%
+
+pause
+
+exit
+
+
+