The main change is an improved mergeStatic function: in the teacher's reference answer, the step where two articles are counted against each other felt a bit redundant. Also, when filtering words I used re.sub() from the regular-expression module to split them, which is not as convenient as the matching approach used in the reference answer.
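For reference, the "matching" approach mentioned above pulls words out directly instead of stripping punctuation and then splitting. A minimal sketch of that idea, assuming a re.findall pattern such as [a-z]+ (my own illustration, not necessarily the pattern used in the reference answer):

import re

text = 'Hello, world. Hello again; "nice" words.'
# Match runs of letters directly, so no punctuation clean-up is needed
words = re.findall('[a-z]+', text.lower())
print(words)  # ['hello', 'world', 'hello', 'again', 'nice', 'words']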
#coding:utf-8
# Count the most frequent words across a batch of articles
import re
# Read one article and return its word-frequency list
def readFile(filename):
    f = open(filename, 'r')
    ff = f.readlines()
    f.close()
    f1 = ''.join(ff)                 # join the lines back into one string
    f2 = re.sub('[,\.";]', '', f1)   # strip punctuation
    f2 = re.sub('[\r\n]+', ' ', f2)  # turn line breaks into spaces
    f2 = re.sub(' +', ' ', f2)       # collapse repeated spaces
    f2 = f2.lower()
    words = f2.split(' ')            # split the article into a list of words
    # Count how many times each word appears
    freq_list = []
    word_saved = []
    for word in words:
        if word and word not in word_saved:  # skip empty strings and already-counted words
            word_saved.append(word)
            freq_list.append((word, words.count(word)))
    sorted_list = sorted(freq_list, key=lambda x: x[1], reverse=True)
    #print sorted_list
    return sorted_list
# Merge two word-frequency lists into one
def mergeStatic(list1, list2):
    word1, num1 = zip(*list1)
    num1 = list(num1)
    word1 = list(word1)
    for word, num in list2:
        if word not in word1:
            # New word: append it together with its count
            word1.append(word)
            num1.append(num)
        else:
            # Word already seen: add the counts together
            index = word1.index(word)
            num1[index] = num + num1[index]
    list1 = zip(word1, num1)
    # Sort by frequency, highest first
    sorted_list = sorted(list1, key=lambda x: x[1], reverse=True)
    return sorted_list
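For comparison, the merge can also be expressed with collections.Counter from the standard library; this is only an alternative sketch, not the approach used above:

from collections import Counter

def mergeStaticCounter(list1, list2):
    # Adding two Counters sums the counts of words that appear in both
    merged = Counter(dict(list1)) + Counter(dict(list2))
    # most_common() returns (word, count) pairs sorted by count, highest first
    return merged.most_common()

In the same spirit, Counter accepts an iterable of words directly, so the counting loop inside readFile could be replaced by Counter(words).most_common().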
if __name__ == "__main__":
    file_list = ['article_000.txt', 'article_001.txt', 'article_002.txt',
                 'article_003.txt', 'article_004.txt', 'article_005.txt']
    cc = map(readFile, file_list)         # per-article frequency lists
    word_list = reduce(mergeStatic, cc)   # merge them into one ranking
    print "Most frequent words:"
    for word in word_list[0:10]:
        print "%-10s %d" % (word[0], word[1])
#Most frequent words:
#the 106
#of 68
#to 51
#and 48
#a 39
#is 35
#it 28
#you 25
#in 24
#your 21
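The script above is written for Python 2 (print statements, builtin reduce, map returning a list). Under Python 3 the main block would need roughly the following changes; a minimal sketch, assuming readFile and mergeStatic stay as defined above:

from functools import reduce  # reduce is not a builtin in Python 3

if __name__ == "__main__":
    file_list = ['article_000.txt', 'article_001.txt', 'article_002.txt',
                 'article_003.txt', 'article_004.txt', 'article_005.txt']
    cc = list(map(readFile, file_list))   # map() returns an iterator in Python 3
    word_list = reduce(mergeStatic, cc)
    print("Most frequent words:")
    for word, count in word_list[0:10]:
        print("%-10s %d" % (word, count))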