-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathquery.py
More file actions
99 lines (85 loc) · 2.9 KB
/
query.py
File metadata and controls
99 lines (85 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Simple extended boolean search engine: query module
# Hussein Suleman
# thesaurus added by Tochukwu Oba
# 14 April 2016
import re
import math
import sys
import os
import porter
import parameters
# check command-line parameters: a collection name and at least one query term
if len(sys.argv) < 3:
    print("Syntax: query.py <collection> <query>")
    # a usage error is a failure: exit with a non-zero status
    # (the original exit(0) signalled success to the shell)
    sys.exit(1)
# construct collection and query
collection = sys.argv[1]  # first argument names the collection
# join all remaining arguments into one space-separated query string;
# the trailing space is kept so the later split produces the same tokens
# as the original concatenation loop did
query = ' '.join(sys.argv[2:]) + ' '
# clean query
if parameters.case_folding:
    query = query.lower()  # fold case so matching is case-insensitive
# replace every non-alphanumeric character with a space, collapse runs of
# whitespace to a single space, then split into individual query terms
query = re.sub(r'[^ a-zA-Z0-9]', ' ', query)
query = re.sub(r'\s+', ' ', query)
query_words = query.split(' ')
# NOTE(review): removed leftover debug prints of query_words and its type —
# they polluted the ranked-results output
# create accumulators and other data structures
accum = {}      # document_id -> accumulated similarity score
filenames = []
p = porter.PorterStemmer()
# get N, the number of documents in the collection; the file holds a single
# integer, so parse it with int() instead of eval() — eval on file contents
# is unsafe and unnecessary.  Context managers guarantee the files close.
with open(collection + "_index_N", "r") as f:
    N = int(f.read())
# get document lengths/titles ("docid:length:title" lines, parsed later)
titles = {}
with open(collection + "_index_len", "r") as f:
    lengths = f.readlines()
# get index for each term and calculate similarities using accumulators
for term in query_words:
    if term == '':
        continue  # skip empty tokens produced by the split
    if parameters.stemming:
        term = p.stem(term, 0, len(term) - 1)
    if not os.path.isfile(collection + "_index/" + term):
        continue  # term occurs nowhere in the collection
    # read all postings ("docid:tf" lines) for this term; the context
    # manager replaces the manual f.close() at the end of the loop body
    with open(collection + "_index/" + term, "r") as f:
        lines = f.readlines()
    # inverse document frequency weighting
    idf = 1
    if parameters.use_idf:
        df = len(lines)  # document frequency = number of postings
        idf = 1 / df
        if parameters.log_idf:
            idf = math.log(1 + N / df)
    # TODO: thesaurus-based query expansion could be added here
    for line in lines:
        mo = re.match(r'([0-9]+)\:([0-9\.]+)', line)
        if mo:
            file_id = mo.group(1)
            tf = float(mo.group(2))
            if file_id not in accum:
                accum[file_id] = 0
            if parameters.log_tf:
                tf = (1 + math.log(tf))  # dampen raw term frequency
            accum[file_id] += (tf * idf)
# parse lengths data ("docid:length:title"), divide each accumulated score
# by the document length when normalization is enabled, and record titles
for l in lengths:
    mo = re.match(r'([0-9]+)\:([0-9\.]+)\:(.+)', l)
    if mo:
        document_id = mo.group(1)
        # the second field is a plain decimal number: parse with float()
        # instead of eval() — eval on file contents is unsafe
        length = float(mo.group(2))
        title = mo.group(3)
        if document_id in accum:
            if parameters.normalization:
                accum[document_id] = accum[document_id] / length
            titles[document_id] = title
# print the ten best-scoring documents, highest similarity first
ranked = sorted(accum, key=accum.get, reverse=True)
for doc_id in ranked[:10]:
    print ("{0:10.8f} {1:5} {2}".format (accum[doc_id], doc_id, titles[doc_id]))