"""Train and evaluate a Naive Bayes sentiment classifier on NLTK's movie_reviews corpus.

The script builds bag-of-words "contains(word)" features from the most
frequent corpus words, trains ``nltk.NaiveBayesClassifier`` on all but a
small held-out slice of the documents, and prints the held-out accuracy
followed by the most informative features.
"""
import random

import nltk
from nltk.corpus import movie_reviews
from nltk import NaiveBayesClassifier
from nltk import classify

# Size of the vocabulary used as features.
NUM_WORD_FEATURES = 3000
# Number of documents held out for evaluation.
TEST_SET_SIZE = 100
# Fixed seed so the train/test split (and thus the reported accuracy) is
# reproducible from run to run.
SHUFFLE_SEED = 0

# Count every word in the corpus, lower-cased. A FreqDist records how many
# times each outcome (here: each word) has occurred.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Select the most frequent words. Iterating the FreqDist directly
# (``list(all_words)``) is not ordered by frequency, so ``most_common`` is
# required to actually get the top-N words.
word_features = [word for word, _count in all_words.most_common(NUM_WORD_FEATURES)]


def document_features(document):
    """Return the Naive Bayes feature dict for one document.

    Args:
        document: An iterable of word tokens.

    Returns:
        A dict mapping ``'contains(<word>)'`` to a bool for every word in
        the module-level ``word_features`` list, telling whether that word
        occurs in ``document``.
    """
    # A set gives the unique words and constant-time membership tests.
    document_words = set(document)
    return {
        'contains({})'.format(word): (word in document_words)
        for word in word_features
    }


# Build the list of (tokens, category) pairs used for training and testing.
documents = [
    (movie_reviews.words(fileid), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# The list above is grouped by category, so without shuffling the first
# TEST_SET_SIZE documents would all come from the first category (as long as
# it has at least that many files), making the accuracy figure meaningless.
random.Random(SHUFFLE_SEED).shuffle(documents)

# Turn every document into a (features, category) pair.
featuresets = [(document_features(d), c) for (d, c) in documents]

# Hold out the first TEST_SET_SIZE items for testing; train on the rest.
train_set, test_set = featuresets[TEST_SET_SIZE:], featuresets[:TEST_SET_SIZE]

classifier = NaiveBayesClassifier.train(train_set)

# Accuracy of the trained classifier on the held-out test set.
accuracy = classify.accuracy(classifier, test_set)
print(accuracy)

# Show the five most informative features. The method prints its own table
# and returns nothing, so wrapping it in print() would emit a stray "None".
classifier.show_most_informative_features(5)