# Assignment 4 submission
# coding: utf-8
# Display the 50 word pairs that occur most often in Richard Martineau's newspaper columns.
import csv
from collections import Counter
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
# Path of the CSV export that is analysed.
analyse = "martino.csv"

# Index of the CSV field that holds the text to analyse.
# NOTE(review): 50 is carried over unchanged from the original script --
# confirm that it matches the layout of martino.csv.
TEXT_FIELD = 50

# Stem prefixes used to select the word pairs that get counted.
# NOTE(review): the original script tried to call ``.append`` on these two
# string literals; they are interpreted here as the stems a pair must
# contain in order to be counted -- confirm this is the intended analysis.
KEYWORD_STEMS = ("islam", "musulm")

# Number of most frequent pairs printed for each keyword stem.
TOP_N = 50


def consecutive_pairs(words):
    """Return the adjacent ``(word, next_word)`` pairs of *words*.

    Args:
        words: A list of strings, in reading order.

    Returns:
        A list of 2-tuples; empty when *words* has fewer than two items.
    """
    return list(zip(words, words[1:]))


def pairs_with_prefix(pairs, prefix):
    """Return the pairs in which at least one word starts with *prefix*.

    Args:
        pairs: An iterable of 2-tuples of strings.
        prefix: The string one member of the pair must start with.

    Returns:
        A list with the matching pairs, in their original order.
    """
    return [
        pair for pair in pairs
        if any(word.startswith(prefix) for word in pair)
    ]


def extract_stems(text, stemmer, stop_words):
    """Tokenize *text* and return the stems of the tokens that are kept.

    Tokens found in *stop_words* and punctuation tokens are discarded before
    stemming, so the result only contains stems of content words.

    Args:
        text: The raw text to tokenize.
        stemmer: An object exposing ``stem(token)``.
        stop_words: A container of lowercase words to discard.

    Returns:
        A list of stems, in reading order.
    """
    return [
        stemmer.stem(token)
        for token in word_tokenize(text)
        # The stop-word list is lowercase, so compare case-insensitively;
        # otherwise capitalised stop words would slip through.
        if token.lower() not in stop_words
        and token not in string.punctuation
    ]


def main():
    """Print, for each keyword stem, its most frequent word pairs.

    Reads every data row of the CSV file named by ``analyse``, builds the
    pairs of consecutive stems of the text field, and counts the pairs that
    contain each keyword stem. For every keyword, prints the ``TOP_N`` most
    common pairs followed by the total number of matching pairs.

    Raises:
        IndexError: If a data row has no field at index ``TEXT_FIELD``.
    """
    stemmer = SnowballStemmer('french')
    # Build the stop-word set once: reading it per token is needlessly slow.
    stop_words = frozenset(stopwords.words('french'))
    counters = {keyword: Counter() for keyword in KEYWORD_STEMS}

    # NOTE(review): assumes the CSV file is UTF-8 encoded -- confirm.
    with open(analyse, encoding="utf-8", newline="") as c:
        manipulations = csv.reader(c)
        next(manipulations, None)  # Skip the header row (if any).
        for inter in manipulations:
            stems = extract_stems(inter[TEXT_FIELD], stemmer, stop_words)
            # Pairs are built per row so they never span two texts.
            pairs = consecutive_pairs(stems)
            for keyword, counter in counters.items():
                counter.update(pairs_with_prefix(pairs, keyword))

    for freq in counters.values():
        print(freq.most_common(TOP_N))
        print(sum(freq.values()))


if __name__ == "__main__":
    main()