main.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. #coding=utf-8
  2. '''
  3. unicodecsv,bs4,lxml
  4. Created on 2017年6月26日
  5. @version:python3.6
  6. @author: liuyuqi
  7. '''
  8. from nt import chdir, listdir
  9. import unicodecsv as csv
  10. from bs4 import BeautifulSoup
  11. dataPath="D:\\t"
  12. resultFile="D:\\result.csv"
  13. def do_list_dir():
  14. for lists in listdir(dataPath):
  15. saveData(dataPath+"\\"+lists)
  16. def saveData(dataFile):
  17. #打开dataFile文件
  18. with open(dataFile, 'r',encoding='utf8', errors = 'replace') as f:
  19. xml_doc =f.read() #读取xml文本内容
  20. #去除空格和换行
  21. xml_doc=xml_doc.replace("\n", "")
  22. xml_doc=xml_doc.replace(" ", "")
  23. #xml形式读取
  24. soup = BeautifulSoup(xml_doc,"xml")
  25. #打开resultFile写文件
  26. csvfile = open(resultFile, 'ab')# r只读,w可写,a追加 b二进制读写
  27. writer = csv.writer(csvfile, dialect='excel', encoding='gb18030',errors="ignore")
  28. needData=soup.findChild("box1")
  29. for i in range(len(needData.contents)):
  30. data=[]
  31. flag=True
  32. for j in range(len(needData.contents[i].contents)):
  33. try:
  34. data.append(needData.contents[i].contents[j].contents[0])
  35. except Exception as e:
  36. print(e)
  37. flag=False
  38. if flag==True:
  39. writer.writerow(data)
  40. csvfile.close()
  41. def main():
  42. print("start!")
  43. chdir(dataPath)
  44. do_list_dir()
  45. print("finish!")
  46. main()