1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
import operator
from collections import Counter
from os import path

import jieba
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def get_poems(name_txt):
    """Read a UTF-8 poem file and return (file name, list of non-empty lines).

    Blank lines produced by consecutive newlines are dropped.
    """
    with open(name_txt, encoding='utf-8') as f:
        file_name = f.name
        lines = f.read().split('\n')
    poems = [line for line in lines if line]
    return file_name, poems
def mk_single(single_txt):
    """Concatenate the 4th space-separated field of each poem line.

    Each line is expected to look like "title author dynasty body"; only the
    body (index 3) is kept. Raises IndexError on lines with fewer fields,
    matching the original behaviour.
    """
    return ''.join(line.split(' ')[3] for line in single_txt)
def rm_symbol(words_txt):
    """Return only the alphabetic characters of *words_txt*, as a list.

    Punctuation, digits and whitespace are discarded; CJK characters count
    as alphabetic under str.isalpha().
    """
    return [ch for ch in words_txt if ch.isalpha()]
def mk_dict(dic_data):
    """Return a dict mapping each item of *dic_data* to its occurrence count,
    ordered from most to least frequent.

    The original implementation called ``dic_data.count(i)`` once per unique
    item — an accidental O(n*u) scan — then sorted ascending and reversed.
    ``collections.Counter.most_common()`` produces the same counts in the
    same descending order in a single O(n + u log u) pass.
    """
    return dict(Counter(dic_data).most_common())
def tj_core(origin_data):
    """Load a poem file and return (author, poem lines, char-frequency dict).

    The author name is taken from the file name, stripped of its extension
    (e.g. '李白.txt' -> '李白').
    """
    file_name, poems = get_poems(origin_data)
    author = str(file_name).split('.', 1)[0]
    char_counts = mk_dict(rm_symbol(mk_single(poems)))
    return author, poems, char_counts
def tj_HFWD(tj_author, tj_poems, tj_dic, pl_min):
    """Return the set of characters used at least *pl_min* times per poem.

    Args:
        tj_author: author name — unused, kept for interface compatibility
            with tj_frequence.
        tj_poems: list of poems; its length normalises the raw counts.
        tj_dic: dict mapping character -> occurrence count.
        pl_min: minimum frequency threshold (count / number of poems).

    Returns:
        A set of the high-frequency characters. An empty poem list now
        returns an empty set instead of raising ZeroDivisionError.
    """
    num_total = len(tj_poems)
    if num_total == 0:
        return set()
    return {ch for ch, cnt in tj_dic.items() if cnt / num_total >= pl_min}
def tj_frequence(tj_author, tj_poems, tj_dic, pl_min):
    """Print every character whose per-poem frequency reaches *pl_min*.

    Output is framed by BEGIN/END banners; each qualifying character is
    printed with its frequency to 8 decimal places.
    """
    num_total = len(tj_poems)
    print('---------BEGIN---------')
    print(tj_author, '----汉字词频', '(>=', pl_min, ')')
    for ch, cnt in tj_dic.items():
        freq = cnt / num_total
        if freq >= pl_min:
            print(ch, '  ', '{:.8f}'.format(freq))
    print('----------END----------')
def pome_cloud(in_poem, in_mask, d, wd_num, wds_max, wdf_siz):
    """Render and display a word cloud for a poem file, shaped by a mask image.

    Args:
        in_poem: poem file name, passed to get_poems.
        in_mask: mask image file name inside directory *d*.
        d: directory containing the mask image.
        wd_num: 1 for single characters, 2 for jieba-segmented words.
        wds_max: maximum number of words in the cloud.
        wdf_siz: maximum font size.
    """
    file_name, poems = get_poems(in_poem)
    author = str(file_name).split('.', 1)[0]
    text = mk_single(poems)
    # Insert spaces so WordCloud can tokenise the Chinese text.
    if wd_num == 1:
        text = ' '.join(list(text))
    elif wd_num == 2:
        text = ' '.join(jieba.cut(text))
    mask_img = np.array(Image.open(path.join(d, in_mask)))
    cloud = WordCloud(background_color="white",
                      font_path='/usr/share/fonts/windows10/simsun.ttc',
                      max_words=wds_max,
                      mask=mask_img,
                      max_font_size=wdf_siz,
                      min_word_length=wd_num).generate(text)
    print(author, '----词云', wd_num)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()
# Frequency threshold and per-author statistics; the tj_core tuples are
# (author, poems, char-frequency dict) and unpack directly into tj_HFWD.
min_f = 0.1
LB = tj_core('李白.txt')
DF = tj_core('杜甫.txt')
LB_frequent = tj_HFWD(*LB, min_f)
DF_frequent = tj_HFWD(*DF, min_f)
# Print both authors' high-frequency character tables.
for report in (LB, DF):
    tj_frequence(*report, min_f)
    print('\n')
def _print_char_set(header_fmt, chars):
    """Print a count header, then the characters 10 per line.

    Factors out the loop that was copy-pasted three times for the
    intersection and the two set differences.
    """
    print(header_fmt.format(len(chars)))
    count = 1
    for ch in chars:
        print(ch, end=' ')
        if count % 10 == 0:
            print()
        count += 1
    print('\n')


_print_char_set('两位诗人都爱用的字有{}个,这些字为:', LB_frequent & DF_frequent)
_print_char_set('李白爱用但杜甫不爱用的字有{}个,这些字为:', LB_frequent - DF_frequent)
_print_char_set('杜甫爱用但李白不爱用的字有{}个,这些字为:', DF_frequent - LB_frequent)
# Show the raw mask image, then the four word clouds: for each poet,
# one cloud of single characters (1) and one of jieba words (2).
poem_path = '/home/feng'
poem_mask = "libai.png"
mask_png = np.array(Image.open(path.join(poem_path, poem_mask)))
plt.imshow(mask_png, interpolation="bilinear")
plt.axis("off")
plt.show()
for poem_file in ('李白.txt', '杜甫.txt'):
    for word_len in (1, 2):
        pome_cloud(poem_file, poem_mask, poem_path, word_len, 100, 40)
|