-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.py
More file actions
99 lines (85 loc) · 2.9 KB
/
query.py
File metadata and controls
99 lines (85 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Simple extended boolean search engine: query module
# Hussein Suleman
# thesaurus added by Tochukwu Oba
# 14 April 2016
import re
import math
import sys
import os
import porter
import parameters
# check command-line parameters: a collection name and at least one query term
if len(sys.argv) < 3:
    print("Syntax: query.py <collection> <query>")
    # a usage error is a failure: exit with a non-zero status
    # (the original exit(0) signalled success to the shell)
    sys.exit(1)
# construct collection and query
collection = sys.argv[1]  # first argument names the collection
# join all remaining arguments into one space-separated query string;
# the trailing space is kept so the later split produces the same tokens
# as the original concatenation loop did
query = ' '.join(sys.argv[2:]) + ' '
# clean query
if parameters.case_folding:
    query = query.lower()  # fold case so matching is case-insensitive
# replace every non-alphanumeric character with a space, collapse runs of
# whitespace to a single space, then split into individual query terms
query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
query = re.sub(r'\s+', ' ', query)
query_words = query.split(' ')
# NOTE(review): removed leftover debug prints of query_words and its type —
# they polluted the ranked-results output
# create accumulators and other data structures
accum = {}      # document_id -> accumulated similarity score
filenames = []
p = porter.PorterStemmer()
# get N, the number of documents in the collection; the file holds a single
# integer, so parse it with int() instead of eval() — eval on file contents
# is unsafe and unnecessary.  Context managers guarantee the files close.
with open(collection + "_index_N", "r") as f:
    N = int(f.read())
# get document lengths/titles ("docid:length:title" lines, parsed later)
titles = {}
with open(collection + "_index_len", "r") as f:
    lengths = f.readlines()
# get index for each term and calculate similarities using accumulators
for term in query_words:
    if term == '':
        continue  # skip empty tokens produced by the split
    if parameters.stemming:
        term = p.stem(term, 0, len(term) - 1)
    if not os.path.isfile(collection + "_index/" + term):
        continue  # term occurs nowhere in the collection
    # read all postings ("docid:tf" lines) for this term; the context
    # manager replaces the manual f.close() at the end of the loop body
    with open(collection + "_index/" + term, "r") as f:
        lines = f.readlines()
    # inverse document frequency weighting
    idf = 1
    if parameters.use_idf:
        df = len(lines)  # document frequency = number of postings
        idf = 1 / df
        if parameters.log_idf:
            idf = math.log(1 + N / df)
    # TODO: thesaurus-based query expansion could be added here
    for line in lines:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
        if mo:
            file_id = mo.group(1)
            tf = float(mo.group(2))
            if file_id not in accum:
                accum[file_id] = 0
            if parameters.log_tf:
                tf = (1 + math.log(tf))  # dampen raw term frequency
            accum[file_id] += (tf * idf)
# parse lengths data ("docid:length:title"), divide each accumulated score
# by the document length when normalization is enabled, and record titles
for l in lengths:
    mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
    if mo:
        document_id = mo.group(1)
        # the second field is a plain decimal number: parse with float()
        # instead of eval() — eval on file contents is unsafe
        length = float(mo.group(2))
        title = mo.group(3)
        if document_id in accum:
            if parameters.normalization:
                accum[document_id] = accum[document_id] / length
            titles[document_id] = title
# print the ten best-scoring documents, highest similarity first
ranked = sorted(accum, key=accum.get, reverse=True)
for doc_id in ranked[:10]:
    print ("{0:10.8f} {1:5} {2}".format (accum[doc_id], doc_id, titles[doc_id]))