clean_cn.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
"""
Clean a Chinese text corpus.

The cleaning level is chosen with ``clean_level``:

clean_level='all': remove every character that is not Chinese, including punctuation.
clean_level='normal': produce a corpus for normal use; Chinese characters, English letters,
    digits and common Chinese/English punctuation are kept.
clean_level='clean': remove everything except Chinese characters and Chinese punctuation.

In addition, to remove complex Chinese characters, set:
simple_only=True
(note: clean_cn_corpus accepts this flag, but the function does not read it yet).
"""
  10. import numpy as np
  11. import os
  12. import string
  13. cn_punctuation_set = [',', '。', '!', '?', '"', '"', '、']
  14. en_punctuation_set = [',', '.', '?', '!', '"', '"']
  15. def clean_cn_corpus(file_name, clean_level='all', simple_only=True, is_save=True):
  16. """
  17. clean Chinese corpus.
  18. :param file_name:
  19. :param clean_level:
  20. :param simple_only:
  21. :param is_save:
  22. :return: clean corpus in list type.
  23. """
  24. if os.path.dirname(file_name):
  25. base_dir = os.path.dirname(file_name)
  26. else:
  27. print('not set dir. please check')
  28. save_file = os.path.join(base_dir, os.path.basename(file_name).split('.')[0] + '_cleaned.txt')
  29. with open(file_name, 'r+') as f:
  30. clean_content = []
  31. for l in f.readlines():
  32. l = l.strip()
  33. if l == '':
  34. pass
  35. else:
  36. l = list(l)
  37. should_remove_words = []
  38. for w in l:
  39. if not should_reserve(w, clean_level):
  40. should_remove_words.append(w)
  41. clean_line = [c for c in l if c not in should_remove_words]
  42. clean_line = ''.join(clean_line)
  43. if clean_line != '':
  44. clean_content.append(clean_line)
  45. if is_save:
  46. with open(save_file, 'w+') as f:
  47. for l in clean_content:
  48. f.write(l + '\n')
  49. print('[INFO] cleaned file have been saved to %s.' % save_file)
  50. return clean_content
  51. def should_reserve(w, clean_level):
  52. if w == ' ':
  53. return True
  54. else:
  55. if clean_level == 'all':
  56. # only reserve Chinese characters
  57. if w in cn_punctuation_set or w in string.punctuation or is_alphabet(w):
  58. return False
  59. else:
  60. return is_chinese(w)
  61. elif clean_level == 'normal':
  62. # reserve Chinese characters, English alphabet, number
  63. if is_chinese(w) or is_alphabet(w) or is_number(w):
  64. return True
  65. elif w in cn_punctuation_set or w in en_punctuation_set:
  66. return True
  67. else:
  68. return False
  69. elif clean_level == 'clean':
  70. if is_chinese(w):
  71. return True
  72. elif w in cn_punctuation_set:
  73. return True
  74. else:
  75. return False
  76. else:
  77. raise "clean_level not support %s, please set for all, normal, clean" % clean_level
  78. def is_chinese(uchar):
  79. """is chinese"""
  80. if u'\u4e00' <= uchar <= u'\u9fa5':
  81. return True
  82. else:
  83. return False
  84. def is_number(uchar):
  85. """is number"""
  86. if u'\u0030' <= uchar <= u'\u0039':
  87. return True
  88. else:
  89. return False
  90. def is_alphabet(uchar):
  91. """is alphabet"""
  92. if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
  93. return True
  94. else:
  95. return False
  96. def semi_angle_to_sbc(uchar):
  97. """半角转全角"""
  98. inside_code = ord(uchar)
  99. if inside_code < 0x0020 or inside_code > 0x7e:
  100. return uchar
  101. if inside_code == 0x0020:
  102. inside_code = 0x3000
  103. else:
  104. inside_code += 0xfee0
  105. return chr(inside_code)
  106. def sbc_to_semi_angle(uchar):
  107. """全角转半角"""
  108. inside_code = ord(uchar)
  109. if inside_code == 0x3000:
  110. inside_code = 0x0020
  111. else:
  112. inside_code -= 0xfee0
  113. if inside_code < 0x0020 or inside_code > 0x7e:
  114. return uchar
  115. return chr(inside_code)