def index_dir(self, base_path):
num_files_indexed = 0
allfiles = os.listdir(base_path)
self._documents = os.listdir(base_path)
num_files_indexed = len(allfiles)
docnumber = 0
self._inverted_index = collections.defaultdict(list)
docnumlist = []
for file in allfiles:
self.documents = [base_path+file] #list of all text files
f = open(base_path+file, 'r')
lines = f.read()
tokens = self.tokenize(lines)
docnumber = docnumber + 1
for term in tokens:
# check if the key/term already exists in the dictionary,
# if yes, just add a new key value/term into the dict
if term not in sorted(self._inverted_index.keys()):
newlist=[]
tf=1
self._inverted_index[term] = []
#self._inverted_index[term][docnumber] +=1
newlist.append(docnumber)
newlist.append(tf)
self._inverted_index[term].append(newlist) #appending list to a list
else:
if docnumber not in self._inverted_index.get(term):
newlist=[]
tf=1
newlist.append(docnumber)
newlist.append(tf)
self._inverted_index[term].append(newlist)
f.close()
print '\n \n'
print 'Dictionary contents: \n'
for term in sorted(self._inverted_index):
print term, '->', self._inverted_index.get(term)
return num_files_indexed
return 0
What I get from this code:
dictionary in this format:
term <- [[docnumber, term freq][docnumber, term freq]]
for ex: if the word cat occurs in doc 1.txt for three times and in Doc 3.txt twice:
I get:
cat <- [[1,1],[1,1],[1,1],[3,1][3,1]]
so, instead of getting [1,1] three times, I want [1,3] added to the list
I don't know how to get rid of repetitive members of the list and increment the term freq.
What I should get:
cat <- [[1,3],[3,2]] i.e. thrice in Doc 1 and twice in doc 3.
I have tried ways to work it out, but I get access errors all the time.
Thanks in advance.