Load the NLTK book data first (run `from nltk.book import *`) so that the `sent*` and `text*` names used below are defined.
# A small hand-made tokenized sentence: show it, then count its tokens.
sent1 = "call me David .".split()
sent1
len(sent1)
# Show sent2, a predefined example sentence — presumably supplied by
# `from nltk.book import *` (the import is not visible in this chunk; confirm it runs first).
print(sent2)
['The', 'family', 'of', 'Dashwood', 'had', 'long', 'been', 'settled', 'in', 'Sussex', '.']
In [28]:
# Show sent3, another predefined example sentence (same nltk.book namespace as sent2).
print(sent3)
['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth', '.']
In [45]:
# Concatenate our sentence with sent4 and show the combined token list
# (the printed result appears below this cell).
a=sent1+sent4
print(a)
# Position of the first occurrence of 'awaken' in text4 (Out shows 173).
text4.index('awaken')
['call', 'me', 'David', '.', 'Fellow', '-', 'Citizens', 'of', 'the', 'Senate', 'and', 'of', 'the', 'House', 'of', 'Representatives', ':']
Out[45]:
173
In [29]:
# Build a frequency distribution (token -> count) over every token of text1.
# FreqDist and text1 presumably come from `from nltk.book import *` — not visible here.
fdist1 = FreqDist(text1)
In [30]:
# Display the distribution's repr (a truncated token -> count mapping, see Out below).
fdist1
Out[30]:
FreqDist({'perusal': 1,
'brimmed': 2,
'puffs': 5,
'smile': 5,
'rather': 68,
'likened': 4,
'dispel': 1,
'Praetorians': 1,
...
'prejudices': 3,
'lower': 57,
'foster': 1,
'cranes': 3,
'thine': 17,
'trodden': 1,
'whereof': 4,
'Fifth': 2,
'robes': 5,
...})
In [34]:
# The distribution's vocabulary: iterating a FreqDist yields its keys,
# so list(fdist1) is the same as list(fdist1.keys()). Peek at the first 50.
vocabulary1 = list(fdist1)
vocabulary1[:50]
Out[34]:
['wards',
'Pythagoras',
'snowy',
'perhaps',
'puffs',
'vanquished',
'135',
'smile',
'half',
'articles',
'jackets',
'fickleness',
'rather',
'sufficiently',
'Republican',
'bunting',
'sued',
'spacious',
'dispel',
'digger',
'secret',
'uncouth',
'justification',
'staunch',
'perusal',
'stones',
'footfall',
'Prodigies',
'ticklish',
'patris',
'indifferently',
'insinuates',
'hastily',
'Hurriedly',
'linen',
'deviations',
'Desolation',
'ruling',
'shipmate',
'reminds',
'revolved',
'engrafted',
'forthing',
'shambling',
'unshunned',
'incompetency',
'Gospel',
'provincialisms',
'these',
'early']
In [35]:
# Look up the count for a single token: how often 'whale' occurs in text1 (Out shows 906).
fdist1['whale']
Out[35]:
906
In [36]:
# Cumulative frequency plot of the 50 most common tokens (opens a matplotlib figure).
fdist1.plot(50, cumulative=True)
In [37]:
# Tokens that occur exactly once in text1 (hapax legomena).
fdist1.hapaxes()
Out[37]:
['Pythagoras',
'vanquished',
'135',
'fickleness',
'Republican',
'bunting',
'sued',
'dispel',
...
'predecessors',
'chimed',
'slopingly',
'properties',
'21',
'quoggy',
'retrace',
'Silent',
'unbegun',
'intestines',
'exploded',
'shelves',
'impudence',
'astronomical',
'BLACKSMITH',
'loosening',
'Isabella',
'patron',
'functionary',
...
'disguisement',
'lieutenant',
'Way',
...]
In [43]:
# Vocabulary of text4, then the unusually long words (>15 characters), alphabetized.
V = set(text4)
long_words = list(filter(lambda w: len(w) > 15, V))
sorted(long_words)
Out[43]:
['RESPONSIBILITIES',
'antiphilosophists',
'constitutionally',
'contradistinction',
'discountenancing',
'disqualification',
'enthusiastically',
'instrumentalities',
'internationality',
'irresponsibility',
'misappropriation',
'misrepresentation',
'misunderstanding',
'responsibilities',
'sentimentalizing',
'transcontinental',
'uncharitableness',
'unconstitutional']
In [ ]:
# Frequent long words in text5: longer than 7 letters AND occurring more than 7 times.
fdist5 = FreqDist(text5)
sorted(w for w in set(text5) if fdist5[w] > 7 and len(w) > 7)
In [46]:
# bigrams pairs each word with its successor: (w1, w2), (w2, w3), ...
from nltk.util import bigrams
sample = ['more', 'is', 'said', 'than', 'done']
list(bigrams(sample))
Out[46]:
[('more', 'is'), ('is', 'said'), ('said', 'than'), ('than', 'done')]
In [47]:
# Word pairs that co-occur more often than chance — printed output follows below.
# Judging from the output, text4 looks like inaugural addresses and text8 like
# a personals corpus (both presumably from nltk.book — confirm).
text4.collocations()
text8.collocations()
United States; fellow citizens; four years; years ago; Federal
Government; General Government; American people; Vice President; Old
World; Almighty God; Fellow citizens; Chief Magistrate; Chief Justice;
God bless; every citizen; Indian tribes; public debt; one another;
foreign nations; political parties
would like; medium build; social drinker; quiet nights; non smoker;
long term; age open; Would like; easy going; financially secure; fun
times; similar interests; Age open; weekends away; poss rship; well
presented; never married; single mum; permanent relationship; slim
build
In [48]:
# The length of every token in text1, in order.
list(map(len, text1))
Out[48]:
[1,
4,
4,
2,
6,
8,
4,
1,
...
1,
4,
5,
5,
4,
1,
2,
...]
In [49]:
# A distribution over word *lengths* rather than over the words themselves;
# FreqDist accepts any iterable, so a generator avoids the intermediate list.
fdist = FreqDist(len(w) for w in text1)
# Show the length -> count table.
fdist
Out[49]:
FreqDist({1: 47933,
2: 38513,
3: 50223,
4: 42345,
5: 26597,
6: 17111,
7: 14399,
8: 9966,
9: 6428,
10: 3528,
11: 1873,
12: 1053,
13: 567,
14: 177,
15: 70,
16: 22,
17: 12,
18: 1,
20: 1})
In [50]:
# The distinct word lengths observed (a dict-keys view of the distribution).
fdist.keys()
Out[50]:
dict_keys([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20])
In [51]:
# (length, count) pairs as a dict-items view.
fdist.items()
Out[51]:
dict_items([(1, 47933), (2, 38513), (3, 50223), (4, 42345), (5, 26597), (6, 17111), (7, 14399), (8, 9966), (9, 6428), (10, 3528), (11, 1873), (12, 1053), (13, 567), (14, 177), (15, 70), (16, 22), (17, 12), (18, 1), (20, 1)])
In [52]:
# The sample with the greatest count — i.e. the most common word length (Out shows 3).
fdist.max()
Out[52]:
3