0
Problem:
I need to create a Dictionary of inverted indexes where I should tokenize the string, remove the stopwords and extract the word root (stemmer) all using the nltk package. The output of the program must be a dictionary of dictionaries containing the word radicals extracted along with the amounts of times they appear in the sentence.
Doubt:
The stemmer I perform on the sentence returns me only the only appearance of the tokenized word, however I need it to appear every time it is in the sentence.
Example of the current output (need to show 'cas' every time the word 'home' is found):
Another issue:
How to concatenate all dictionaries into a single?
Exit from the program:
Code:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
def readBaseFile(fileName):
with open(fileName, 'r') as fileRead:
lines = fileRead.read().splitlines()
fileRead.close()
return lines
def countFrequency(my_list, fileNumber):
freq = {}
for key in my_list:
if (key not in freq):
freq[key] = {fileNumber:1}
else:
freq[key] = {fileNumber:freq[key]+1}
print(freq)
return freq
def readLinesInSubfile(linesInBaseFile, fileNumber):
tokenizeWord = tokenizer.tokenize(linesInBaseFile[0])
print(tokenizeWord)
tokenizeWord = {stemmer.stem(word) for word in tokenizeWord if not word in stopwords}
print(tokenizeWord)
return countFrequency(tokenizeWord, fileNumber)
def subfilesOfBaseFile(linesInBaseFile, fileNumber):
linesInSubfiles = {}
for files in range(len(linesInBaseFile)):
linesInSubfiles = readBaseFile('base1/'+linesInBaseFile[files])
dictionary = readLinesInSubfile(linesInSubfiles, fileNumber)
fileNumber = fileNumber + 1
return dictionary
stopwords = nltk.corpus.stopwords.words("portuguese")
stemmer = nltk.stem.RSLPStemmer()
tokenizer = nltk.RegexpTokenizer(r"\w+")
def main():
fileToRead = "base1/base.txt"
fileNumber = 1
linesInBaseFile = readBaseFile(fileToRead)
print(linesInBaseFile)
subfilesOfBaseFile(linesInBaseFile, fileNumber)
if __name__ == "__main__":
main()