1+ import sqlite3
2+ import test_data
3+ import ast
4+ import json
5+
class SearchEngine:
    """
    A minimal full-text search engine backed by SQLite.

    It works by building a reverse index that maps every word to the ids
    of the documents that contain it. To find the document(s) matching a
    search term, the id lists of all the words in the term are
    intersected.

    Storage layout
    --------------
    - IdToDoc(id INTEGER PRIMARY KEY, document TEXT): the raw documents.
    - WordToId(name TEXT, value TEXT): a single row named 'index' whose
      value is the JSON-encoded reverse index ``{word: [doc_id, ...]}``.

    Words are matched exactly (case-sensitive, no stemming) and are
    delimited by any run of whitespace.
    """

    def __init__(self, db_path="searchengine.sqlite3"):
        """
        Returns - None
        Input - str: path of the SQLite database file (defaults to
                "searchengine.sqlite3"; ":memory:" gives a throwaway
                in-memory database)
        ----------
        - Open the database. we use sqlite3
        - Create the tables and the seed index row if they are missing
        - maintain an instance level access to the database
          connection object (``self.conn``)
        """
        # isolation_level=None commits every statement as soon as it
        # runs, like autocommit=True, but the keyword is also accepted
        # by Python versions older than 3.12.
        self.conn = sqlite3.connect(db_path, isolation_level=None)
        self._ensure_schema()

    def _ensure_schema(self):
        """
        Returns - None
        Input - None
        ----------
        Idempotently create both tables and the 'index' row. Each piece
        is checked on its own so a partially initialised database
        (e.g. IdToDoc present but WordToId missing) is repaired instead
        of failing later.
        """
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS "
            "IdToDoc(id INTEGER PRIMARY KEY, document TEXT)"
        )
        self.conn.execute(
            "CREATE TABLE IF NOT EXISTS WordToId (name TEXT, value TEXT)"
        )
        # WordToId has no UNIQUE constraint on name, so guard the seed
        # insert explicitly to avoid duplicating the 'index' row.
        self.conn.execute(
            "INSERT INTO WordToId (name, value) "
            "SELECT 'index', '{}' "
            "WHERE NOT EXISTS (SELECT 1 FROM WordToId WHERE name='index')"
        )

    def close(self):
        """
        Returns - None
        Input - None
        ----------
        Close the underlying database connection.
        """
        self.conn.close()

    def _load_index(self):
        """
        Returns - dict: the reverse index {word: [doc_id, ...]}
        Input - None
        ----------
        Read the JSON blob stored in the 'index' row and decode it.
        """
        row = self.conn.execute(
            "SELECT value FROM WordToId WHERE name='index'"
        ).fetchone()
        return json.loads(row[0])

    def _save_index(self, reverse_idx):
        """
        Returns - None
        Input - dict: the reverse index {word: [doc_id, ...]}
        ----------
        Encode the reverse index as JSON and overwrite the 'index' row.
        """
        self.conn.execute(
            "UPDATE WordToId SET value = (?) WHERE name='index'",
            (json.dumps(reverse_idx),),
        )

    def index_document(self, document):
        """
        Returns - str: the literal "index successful"
        Input - str: a string of words called document
        ----------
        Indexes the document. It does this by performing two
        operations - add the document to IdToDoc, then add the words
        in the document to the reverse index stored in WordToId
        - stores the document and retrieves the id of the inserted row
        - loads the reverse index
        - records the id under every distinct word of the document
          (words are split on any whitespace)
        - writes the updated reverse index back
        """
        row_id = self._add_to_IdToDoc(document)
        reverse_idx = self._load_index()
        # dict.fromkeys de-duplicates repeated words while keeping
        # their first-seen order.
        for word in dict.fromkeys(document.split()):
            postings = reverse_idx.setdefault(word, [])
            if row_id not in postings:
                postings.append(row_id)
        self._save_index(reverse_idx)
        return "index successful"

    def _add_to_IdToDoc(self, document):
        """
        Returns - int: the id of the inserted document
        Input - str: a string of words called `document`
        ---------
        - use the instance-level connection object to insert the
          document into the db
        - retrieve and return the row id of the inserted document
        """
        cur = self.conn.execute(
            "INSERT INTO IdToDoc (document) VALUES (?)", (document,)
        )
        return cur.lastrowid

    def find_documents(self, search_term):
        """
        Returns - list[tuple[str]]: one 1-tuple ``(document,)`` per
                  document that contains EVERY word of the search term,
                  ordered by document id; [] when nothing matches
        Input - str: a string of words called `search_term`
        ---------
        - split the search term on whitespace, exactly like
          index_document does, so both sides agree on what a word is
        - retrieve the reverse index
        - intersect the id lists of all the words; a word that was
          never indexed has no documents, so the whole search is empty
        - fetch the surviving ids with _find_documents_with_idx
        """
        terms = search_term.split()
        if not terms:  # empty or whitespace-only query
            return []

        reverse_idx = self._load_index()
        common_ids = None
        for term in terms:
            postings = reverse_idx.get(term)
            if not postings:  # the word does not exist in any document
                return []
            if common_ids is None:
                common_ids = set(postings)
            else:
                common_ids.intersection_update(postings)
            if not common_ids:  # no document contains all words so far
                return []

        return self._find_documents_with_idx(common_ids)

    def _find_documents_with_idx(self, idxs):
        """
        Returns - list[tuple[str]]: the rows ``(document,)`` whose id is
                  in `idxs`, ordered by id
        Input - iterable of int document ids
        ---------
        - build one '?' placeholder per id so the ids are passed as
          bound parameters
        - use the instance-level connection object to retrieve the
          matching documents and return them as a list
        """
        idxs = sorted(idxs)
        if not idxs:
            return []
        sql = "SELECT document FROM IdToDoc WHERE id in ({seq}) ORDER BY id".format(
            seq=",".join(["?"] * len(idxs))
        )
        return self.conn.execute(sql, idxs).fetchall()
127+
128+
if __name__ == "__main__":
    # Small demo: index four sample documents, then search for one word.
    engine = SearchEngine()

    engine.index_document("we should all strive to be happy and happy again")

    # Only the second indexing call has its status message printed.
    status = engine.index_document("happiness is all you need")
    print(status)

    engine.index_document("no way should we be sad")
    engine.index_document("a cheerful heart is a happy one even in Nigeria")

    matches = engine.find_documents("happy")
    print(matches)
0 commit comments