import numpy as np
import nltk
from nltk.corpus import brown


# Exercise 1

def my_frequency(j):
    """Count how often each item occurs in the iterable ``j``.

    Items must be hashable, since they are used as dictionary keys.

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences, with keys in order of first appearance.
    """
    d = {}
    for it in j:
        d[it] = d.get(it, 0) + 1
    return d


# Exercise 2

def my_median(l):
    """Return the median of the values in ``l``.

    For an odd number of values the middle element is returned unchanged;
    for an even number the mean of the two middle elements is returned
    (true division).

    Raises:
        IndexError: if ``l`` is empty.
    """
    s = sorted(l)
    n = len(s)
    # Integer division keeps the index arithmetic exact; no float round trip.
    mid = n // 2
    if n % 2 == 0:
        return (s[mid - 1] + s[mid]) / 2
    return s[mid]


def my_median_2(li):
    """Return the median of ``li`` using a single branch-free expression.

    For odd lengths both indices point at the same middle element, so the
    mean of the "two" values equals that element. Unlike ``my_median``, the
    result always goes through true division.

    Raises:
        IndexError: if ``li`` is empty.
    """
    sor = sorted(li)
    n = len(li)
    return (sor[(n - 1) // 2] + sor[n // 2]) / 2


def my_mean(l):
    """Return the arithmetic mean of the values in ``l``.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    return sum(l) / len(l)


# Exercise 3

# How to handle lower case?
labels = ['i', 'we', 'he', 'she', 'they']
# Build the lookup set once instead of once per corpus word.
label_set = set(labels)

brown_freq = nltk.FreqDist(
    [low for low in (w.lower() for w in brown.words()) if low in label_set])

# To replace 'i' with 'I' as key:
v = brown_freq.pop('i')
brown_freq['I'] = v

brown_freq.tabulate()
brown_freq.plot()


# Exercise 4

genres = ['news', 'fiction']

# The following does not do lower case correctly: words are compared in
# their original case, so forms such as 'He' or 'I' never match the
# lower-case labels.
cond = nltk.ConditionalFreqDist(
    [(genre, word)
     for genre in genres
     for word in brown.words(categories=genre)
     if word in label_set])

# To get lower case:
cf_low = nltk.ConditionalFreqDist(
    [(genre, low)
     for genre in genres
     for low in (word.lower() for word in brown.words(categories=genre))
     if low in label_set])

# To decorate the result: show the pronoun as 'I' rather than 'i'.
for genre in genres:
    cf_low[genre]['I'] = cf_low[genre].pop('i')

print("\nTable without case folding:")
cond.tabulate()
print("\nTable with case folding:")
cf_low.tabulate()