-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIndexer.html
56 lines (52 loc) · 4.76 KB
/
Indexer.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<html><head><title>Python: module Indexer</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head><body bgcolor="#f0f0f8">
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="heading">
<tr bgcolor="#7799ee">
<td valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"> <br><big><big><strong>Indexer</strong></big></big></font></td
><td align=right valign=bottom
><font color="#ffffff" face="helvetica, arial"><a href=".">index</a><br><a href="file:z%3A%5Cassignment1%5Cprograms%5Ctesting2%5Cindexer.py">z:\assignment1\programs\testing2\indexer.py</a></font></td></tr></table>
<p><tt>Traverses through the docs in a directory,<br>
creates the idf and the tf tables,<br>
and writes them to files</tt></p>
<p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#aa55cc">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Modules</strong></big></font></td></tr>
<tr><td bgcolor="#aa55cc"><tt> </tt></td><td> </td>
<td width="100%"><table width="100%" summary="list"><tr><td width="25%" valign=top><a href="os.html">os</a><br>
</td><td width="25%" valign=top><a href="pickle.html">pickle</a><br>
</td><td width="25%" valign=top><a href="re.html">re</a><br>
</td><td width="25%" valign=top><a href="time.html">time</a><br>
</td></tr></table></td></tr></table><p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#eeaa77">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Functions</strong></big></font></td></tr>
<tr><td bgcolor="#eeaa77"><tt> </tt></td><td> </td>
<td width="100%"><dl><dt><a name="-addTotf"><strong>addTotf</strong></a>(tf, TFhelper, docID)</dt><dd><tt>Adds tokens of each doc to the tf table</tt></dd></dl>
<dl><dt><a name="-cleantokensofeachdoc"><strong>cleantokensofeachdoc</strong></a>(tf, porter, tokensindoc, sw, docID)</dt><dd><tt>Cleans a list of tokens - stemming, removing repetition,<br>
and adds the cleaned tokens to the tf table</tt></dd></dl>
<dl><dt><a name="-createIDF"><strong>createIDF</strong></a>(postingslist)</dt><dd><tt>Creates the idf table from the postingslist</tt></dd></dl>
<dl><dt><a name="-getStopwords"><strong>getStopwords</strong></a>(stopwordsFile)</dt><dd><tt>Gets the stopwords from the stopwords file</tt></dd></dl>
<dl><dt><a name="-insertinpostingslist"><strong>insertinpostingslist</strong></a>(tokensindoc, docID, postingslist)</dt><dd><tt>Inserts all the given tokens of each doc into the postings list,<br>
in a suitable format, for later retrieval</tt></dd></dl>
<dl><dt><a name="-main"><strong>main</strong></a>()</dt><dd><tt>Main function; calls the other functions</tt></dd></dl>
<dl><dt><a name="-tokenizedoc"><strong>tokenizedoc</strong></a>(tf, porter, current_file, sw, docID)</dt><dd><tt>Returns all the cleaned tokens within the given doc</tt></dd></dl>
<dl><dt><a name="-traversethroughdocs"><strong>traversethroughdocs</strong></a>(tf, porter, sw, postingslist)</dt><dd><tt>Traverses through all the docs in the directory, indexing them</tt></dd></dl>
<dl><dt><a name="-writeToFile"><strong>writeToFile</strong></a>(filename, index)</dt><dd><tt>Writes the given index to a file, as an object,<br>
using the pickle module</tt></dd></dl>
<dl><dt><a name="-writeToHumanReadableFile"><strong>writeToHumanReadableFile</strong></a>(filename, index)</dt><dd><tt>Writes the given index to a file,<br>
in a format suitable for human reference,<br>
unlike a serialized object, which is not human-readable</tt></dd></dl>
</td></tr></table><p>
<table width="100%" cellspacing=0 cellpadding=2 border=0 summary="section">
<tr bgcolor="#55aa55">
<td colspan=3 valign=bottom> <br>
<font color="#ffffff" face="helvetica, arial"><big><strong>Data</strong></big></font></td></tr>
<tr><td bgcolor="#55aa55"><tt> </tt></td><td> </td>
<td width="100%"><strong>direc</strong> = r'Z:\Assignment1\Programs\ExtractedText'</td></tr></table>
</body></html>