Changeset 192
- Timestamp: 08/17/08 16:28:29 (3 months ago)
- Files:
- trunk/shakespeare/controllers/search.py (modified) (1 diff)
- trunk/shakespeare/search.py (modified) (1 diff)
- trunk/shakespeare/stats.py (modified) (1 diff)
- trunk/shakespeare/templates/search/index.html (modified) (1 diff)
- trunk/shakespeare/tests/functional/test_search.py (modified) (1 diff)
- trunk/shakespeare/tests/test_search.py (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/shakespeare/controllers/search.py
Revision 191 Revision 192 1 import logging 1 import logging 2 2 3 from shakespeare.lib.base import * 3 from shakespeare.lib.base import * 4 4 5 log = logging.getLogger(__name__) 5 log = logging.getLogger(__name__) 6 6 7 import shakespeare.search 7 import shakespeare.search 8 8 9 class SearchController(BaseController): 9 class SearchController(BaseController): 10 10 11 def index(self): 11 def index(self): 12 c.query = request.params.get('query', '') 12 c.query = request.params.get('query', '') 13 if c.query: 13 if c.query: 14 c.matches = self._get_matches(c.query)14 matches = self._get_matches(c.query) 15 c.results = self._get_results(c.matches)15 c.results = [ SearchResult.from_match(m) for m in matches ] 16 c.total = c.matches.get_matches_estimated()16 c.total = matches.get_matches_estimated() 17 else: 17 else: 18 c. matches = None18 c.total = -1 19 return render('search/index') 19 return render('search/index') 20 20 21 def _get_matches(self, query): 21 def _get_matches(self, query): 22 index = shakespeare.search.SearchIndex.default_index() 22 index = shakespeare.search.SearchIndex.default_index() 23 matches = index.search(query, numresults=50) 23 matches = index.search(query, numresults=50) 24 return matches 24 return matches 25 25 26 def _get_results(self, matches): 26 class SearchResult(object): 27 results = [] 27 def __init__(self, snippet='', text=None, lineno=None): 28 for m in matches: 28 for k,v in locals().items(): 29 text, lineno = self._match_to_text(m) 29 setattr(self, k, v) 30 if text: 30 if self.text: 31 # slight hack -- just attach direct to object 31 self.title = self.text.title 32 text._lineno = lineno 32 else: 33 text._snippet = m.document.get_data() 33 self.title = 'Unknown' 34 results.append(text) 35 else: 36 # TODO: create a dummy text ... 
37 pass 38 return results 39 34 40 def _match_to_text(self, m): 35 @classmethod 36 def from_match(cls, m): 37 snippet = m.document.get_data() 41 item_id = m.document.get_value(shakespeare.search.ITEM_ID) 38 item_id = m.document.get_value(shakespeare.search.ITEM_ID) 42 text = model.Material.byName(item_id) 39 text = model.Material.byName(item_id) 43 lineno = m.document.get_value(shakespeare.search.LINE_NO) 40 lineno = m.document.get_value(shakespeare.search.LINE_NO) 44 return (text, lineno)41 return cls(snippet, text, lineno) 45 42 trunk/shakespeare/search.py
Revision 191 Revision 192 1 '''Support for indexing and searching texts using xapian. 1 '''Support for indexing and searching texts using xapian. 2 2 3 Architecture 3 Architecture 4 ============ 4 ============ 5 5 6 For information on theoretical structure of Xapain see: 6 For information on theoretical structure of Xapain see: 7 http://xapian.org/docs/intro_ir.html 7 http://xapian.org/docs/intro_ir.html 8 8 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 9 For basic demo python code see: http://xapian.org/docs/bindings/python/ 10 10 11 For helpful example of using Xapian in python (including metadata, add_post 11 For helpful example of using Xapian in python (including metadata, add_post 12 etc) see: 12 etc) see: 13 13 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 14 * http://www.thesamet.com/blog/2007/02/04/pumping-up-your-applications-with-xapian-full-text-search/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 15 * http://www.rkblog.rk.edu.pl/w/p/xapian-python/ 16 16 17 Here we discuss how we can use Xapian in OS. Two main tasks: 17 Here we discuss how we can use Xapian in OS. Two main tasks: 18 18 19 1. Do search 19 1. Do search 20 2. Produce statistics 20 2. Produce statistics 21 21 22 Second task just requires stemming support, first requires full Xapian 22 Second task just requires stemming support, first requires full Xapian 23 facilities. Main question for indexing is: 23 facilities. Main question for indexing is: 24 24 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 25 * What is our atomization level. I.e. what are 'documents' we index? Is it: 26 * A whole poem or play 26 * A whole poem or play 27 * Is it a paragraph within a work 27 * Is it a paragraph within a work 28 * Is it a character's whole speech? 28 * Is it a character's whole speech? 29 29 30 TODO: 30 TODO: 31 * add metadata (e.g. which character is speaking, work id ...) 
31 * add metadata (e.g. which character is speaking, work id ...) 32 ''' 32 ''' 33 import os 33 import os 34 import re 34 import re 35 35 36 import xapian 36 import xapian 37 37 38 # keys for document values 38 # keys for document values 39 ITEM_ID = 0 39 ITEM_ID = 0 40 LINE_NO = 1 40 LINE_NO = 1 41 41 42 class SearchIndex(object): 42 class SearchIndex(object): 43 def __init__(self, index_dir): 43 def __init__(self, index_dir): 44 self.index_dir = index_dir 44 self.index_dir = index_dir 45 45 46 @classmethod 46 @classmethod 47 def config_index_dir(self): 47 def config_index_dir(self): 48 '''Get the search index directory specified in the config.''' 48 '''Get the search index directory specified in the config.''' 49 import shakespeare 49 import shakespeare 50 conf = shakespeare.conf() 50 conf = shakespeare.conf() 51 index_dir = conf['search_index_dir'] 51 index_dir = conf['search_index_dir'] 52 return index_dir 52 return index_dir 53 53 54 @classmethod 54 @classmethod 55 def default_index(self): 55 def default_index(self): 56 '''Return a SearchIndex instance initialized with the path specified in 56 '''Return a SearchIndex instance initialized with the path specified in 57 the configuration file. 57 the configuration file. 58 ''' 58 ''' 59 index_dir = self.config_index_dir() 59 index_dir = self.config_index_dir() 60 if not os.path.exists(index_dir): 60 if not os.path.exists(index_dir): 61 os.makedirs(index_dir) 61 os.makedirs(index_dir) 62 return SearchIndex(index_dir) 62 return SearchIndex(index_dir) 63 63 64 def add_item(self, fileobj, item_id=None): 64 def _make_id_term(self, item_id): 65 return 'I' + str(item_id) 66 67 def add_item(self, fileobj, item_id): 68 '''Add a text contained in fileobj and identified by item_id to the 69 Xapian search database. 70 71 Each item added is broken in paragraphs to be indexed with each 72 paragraph becoming a separate L{xapian.Document}. 
73 74 Each such document has an associated id_term based on the item_id and 75 the value and lineno are stored in Xapian values keyed by ITEM_ID and 76 LINE_NO. 77 ''' 65 database = xapian.WritableDatabase(self.index_dir, xapian.DB_CREATE_OR_OPEN) 78 database = xapian.WritableDatabase(self.index_dir, xapian.DB_CREATE_OR_OPEN) 66 indexer = xapian.TermGenerator() 79 indexer = xapian.TermGenerator() 67 stemmer = xapian.Stem("english") 80 stemmer = xapian.Stem("english") 68 indexer.set_stemmer(stemmer) 81 indexer.set_stemmer(stemmer) 69 82 70 para = '' 83 para = '' 71 try: 84 try: 72 count = -1 85 count = -1 73 para_start = 0 86 para_start = 0 74 for line in fileobj: 87 for line in fileobj: 75 count += 1 88 count += 1 76 line = line.strip() 89 line = line.strip() 77 if line == '': 90 if line == '': 78 if para != '': 91 if para != '': 79 doc = xapian.Document() 92 doc = xapian.Document() 80 doc.set_data(para) 93 doc.set_data(para) 81 id_term = 'I' + str(item_id)94 id_term = self._make_id_term(item_id) 82 doc.add_term(id_term) 95 doc.add_term(id_term) 83 doc.add_value(ITEM_ID, str(item_id)) 96 doc.add_value(ITEM_ID, str(item_id)) 84 doc.add_value(LINE_NO, str(para_start)) 97 doc.add_value(LINE_NO, str(para_start)) 85 98 86 indexer.set_document(doc) 99 indexer.set_document(doc) 87 # this *will* include positional information 100 # this *will* include positional information 88 indexer.index_text(para) 101 indexer.index_text(para) 89 102 90 database.add_document(doc) 103 database.add_document(doc) 91 # assume next para starts 104 # assume next para starts 92 para = '' 105 para = '' 93 # must come after 106 # must come after 94 para_start = count 107 para_start = count 95 else: 108 else: 96 if para != '': 109 if para != '': 97 para += '\n' 110 para += '\n' 98 para += line 111 para += line 99 except StopIteration: 112 except StopIteration: 100 # TODO: what is happening here? 113 # TODO: what is happening here? 
101 raise 114 raise 115 116 def remove_item(self, item_id): 117 id_term = self._make_id_term(item_id) 118 database = xapian.WritableDatabase(self.index_dir, xapian.DB_CREATE_OR_OPEN) 119 database.delete_document(id_term) 102 120 103 def get_database(self): 121 def get_database(self): 104 database = xapian.Database(self.index_dir) 122 database = xapian.Database(self.index_dir) 105 return database 123 return database 106 124 107 def search(self, query_string, offset=0, numresults=10): 125 def search(self, query_string, offset=0, numresults=10): 108 database = self.get_database() 126 database = self.get_database() 109 enquire = xapian.Enquire(database) 127 enquire = xapian.Enquire(database) 110 qp = xapian.QueryParser() 128 qp = xapian.QueryParser() 111 stemmer = xapian.Stem("english") 129 stemmer = xapian.Stem("english") 112 qp.set_stemmer(stemmer) 130 qp.set_stemmer(stemmer) 113 qp.set_database(database) 131 qp.set_database(database) 114 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 132 qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME) 115 query = qp.parse_query(query_string) 133 query = qp.parse_query(query_string) 116 enquire.set_query(query) 134 enquire.set_query(query) 117 matches = enquire.get_mset(offset, numresults) 135 matches = enquire.get_mset(offset, numresults) 118 return matches 136 return matches 119 137 120 def add_from_path(self, path): 138 def add_from_path(self, path): 121 '''Add contents of {path} (file itself or all text files in directory 139 '''Add contents of {path} (file itself or all text files in directory 122 if directory) to the search index.''' 140 if directory) to the search index.''' 123 path = path.strip() 141 path = path.strip() 124 if not os.path.exists(path): 142 if not os.path.exists(path): 125 print '"%s" is not an existent path' % path 143 print '"%s" is not an existent path' % path 126 return 1 144 return 1 127 if os.path.isdir(path): 145 if os.path.isdir(path): 128 fns = os.listdir(path) 146 fns = 
os.listdir(path) 129 fns = filter(lambda x: x.endswith('.txt'), fns) 147 fns = filter(lambda x: x.endswith('.txt'), fns) 130 works = [ os.path.join(path, fn) for fn in fns ] 148 works = [ os.path.join(path, fn) for fn in fns ] 131 else: 149 else: 132 works = [ path ] 150 works = [ path ] 133 for work in works: 151 for work in works: 134 fileobj = open(work) 152 fileobj = open(work) 135 self.add_item(fileobj) 153 self.add_item(fileobj) 136 154 137 @classmethod 155 @classmethod 138 def print_matches(self, matches): 156 def print_matches(self, matches): 139 # Display the results. 157 # Display the results. 140 msg = '%i results found.' % matches.get_matches_estimated() 158 msg = '%i results found.' % matches.get_matches_estimated() 141 msg += 'Results 1-%i:' % matches.size() 159 msg += 'Results 1-%i:' % matches.size() 142 160 143 for m in matches: 161 for m in matches: 144 msg += '\n' 162 msg += '\n' 145 msg += '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 163 msg += '%i: %i%% docid=%i' % (m.rank + 1, m.percent, m.docid) 146 msg += '\n' 164 msg += '\n' 147 msg += m.document.get_data() 165 msg += m.document.get_data() 148 msg += '\n' 166 msg += '\n' 149 return msg 167 return msg 150 168 trunk/shakespeare/stats.py
Revision 187 Revision 192 1 """ 1 """ 2 Statistics for texts. 2 Statistics for texts. 3 3 4 NB: all word keys have been lower-cased in order to render them 4 All word keys are lower-cased in order to render them case-insensitive and 5 case-insensitive 5 are stemmed using the Xapian standard English stemmer. 6 6 7 TODO 8 ==== 9 10 1. Provide for normalized statistics (that is occurences normalized by their 11 occurence in the particular text). 12 13 2. Support for aggregate statistics across multiple texts 7 """ 14 """ 8 import re 15 import re 9 import xapian 16 import xapian 10 17 11 import shakespeare.model as model 18 import shakespeare.model as model 12 19 13 class Stats(object): 20 class Stats(object): 14 21 15 @classmethod 22 @classmethod 16 def analyze(self, fileobj): 23 def analyze(self, fileobj): 17 '''Get statistics on text in fileobj. 24 '''Get statistics on text in fileobj. 18 25 19 Words are stemmed so that e.g. love and loved count as the same word. 26 Words are stemmed so that e.g. love and loved count as the same word. 20 ''' 27 ''' 21 # (?) maybe could use xapian.TermGenerator to split document 28 # (?) 
maybe could use xapian.TermGenerator to split document 22 WORD_RE = re.compile('\\w{1,32}', re.U) 29 WORD_RE = re.compile('\\w{1,32}', re.U) 23 stemmer = xapian.Stem('english') 30 stemmer = xapian.Stem('english') 24 results = {} 31 results = {} 25 text = fileobj.read() 32 text = fileobj.read() 26 text = text.encode('utf8') 33 text = text.encode('utf8') 27 for term in WORD_RE.finditer(text): 34 for term in WORD_RE.finditer(text): 28 word = term.group() 35 word = term.group() 29 word = word.lower() 36 word = word.lower() 30 stemmed_word = stemmer(word) 37 stemmed_word = stemmer(word) 31 results[stemmed_word] = results.get(stemmed_word, 0) + 1 38 results[stemmed_word] = results.get(stemmed_word, 0) + 1 32 return results 39 return results 33 40 34 def statsify(self, material, fileobj): 41 def statsify(self, material, fileobj): 35 '''Create statistics associated to domain object `material` whose 42 '''Create statistics associated to domain object `material` whose 36 content is in `fileobj`. 43 content is in `fileobj`. 
37 ''' 44 ''' 38 stats = self.analyze(fileobj) 45 stats = self.analyze(fileobj) 39 for k in stats: 46 for k in stats: 40 model.Statistic(text=material, 47 model.Statistic(text=material, 41 word=k, 48 word=k, 42 freq=stats[k] 49 freq=stats[k] 43 ) 50 ) 44 model.Session.flush() 51 model.Session.flush() 45 52 46 def freq(self, text, word): 53 def freq(self, text, word): 47 stat = model.Statistic.query.filter_by( 54 stat = model.Statistic.query.filter_by( 48 text=text).filter_by(word=word).first() 55 text=text).filter_by(word=word).first() 49 if stat: 56 if stat: 50 return stat.freq 57 return stat.freq 51 else: 58 else: 52 return 0 59 return 0 53 60 54 def text_stats(self, text): 61 def text_stats(self, text): 55 '''Return word statistics for text, most popular word first.''' 62 '''Return word statistics for text, most popular word first.''' 56 stats = model.Statistic.query.order_by(model.Statistic.freq.desc()).all() 63 stats = model.Statistic.query.order_by(model.Statistic.freq.desc()).all() 57 return stats 64 return stats 58 65 trunk/shakespeare/templates/search/index.html
Revision 191 Revision 192 1 <html xmlns:py="http://genshi.edgewall.org/" 1 <html xmlns:py="http://genshi.edgewall.org/" 2 xmlns:xi="http://www.w3.org/2001/XInclude"> 2 xmlns:xi="http://www.w3.org/2001/XInclude"> 3 3 4 <py:def function="page_title">Search Shakespeare's Work</py:def> 4 <py:def function="page_title">Search Shakespeare's Work</py:def> 5 5 6 <div py:match="content"> 6 <div py:match="content"> 7 <form name="test" method="GET" action=""> 7 <form name="test" method="GET" action=""> 8 <input type="text" name="query" /> 8 <input type="text" name="query" /> 9 <input type="submit" name="submit" value="Submit" /> 9 <input type="submit" name="submit" value="Submit" /> 10 </form> 10 </form> 11 11 12 <div class="search-results" py:if="c. matches is not None">12 <div class="search-results" py:if="c.total >= 0"> 13 <h3>Search Results For: ${c.query}</h3> 13 <h3>Search Results For: ${c.query}</h3> 14 There were ${c.total} results. 14 There were ${c.total} results. 15 <ul> 15 <ul> 16 <li py:for="m in c.results"> 16 <li py:for="m in c.results"> 17 Work: ${m.title}, Line: ${m. _lineno}17 Work: ${m.title}, Line: ${m.lineno} 18 <blockquote> 18 <blockquote> 19 <pre>${m. _snippet}</pre>19 <pre>${m.snippet}</pre> 20 </blockquote> 20 </blockquote> 21 </li> 21 </li> 22 </ul> 22 </ul> 23 </div> 23 </div> 24 </div> 24 </div> 25 25 26 <xi:include href="../layout.html" /> 26 <xi:include href="../layout.html" /> 27 </html> 27 </html> trunk/shakespeare/tests/functional/test_search.py
Revision 191 Revision 192 1 import StringIO 1 import StringIO 2 2 3 from shakespeare.tests import * 3 from shakespeare.tests import * 4 4 5 import shakespeare.search 5 import shakespeare.search 6 6 7 class TestSearchController(TestController): 7 class TestSearchController(TestController): 8 8 9 def setUp(self): 9 @classmethod 10 # TODO: remove this item from index in tearDown 10 def setup_class(self): 11 text = make_fixture() 11 self.text = make_fixture() 12 sindex = shakespeare.search.SearchIndex.default_index() 12 self.sindex = shakespeare.search.SearchIndex.default_index() 13 sindex.add_item(StringIO.StringIO(text.content), text.name) 13 self.sindex.add_item(StringIO.StringIO(self.text.content), self.text.name) 14 15 @classmethod 16 def teardown_class(self): 17 self.sindex.remove_item(self.text.name) 14 18 15 def test_index(self): 19 def test_index(self): 16 url = url_for(controller='search') 20 url = url_for(controller='search') 17 res = self.app.get(url) 21 res = self.app.get(url) 18 assert "Search" in res 22 assert "Search" in res 19 23 20 def test_search(self): 24 def test_search(self): 21 url = url_for(controller='search') 25 url = url_for(controller='search') 22 res = self.app.get(url) 26 res = self.app.get(url) 23 form = res.forms[0] 27 form = res.forms[0] 24 form['query'] = 'summer' 28 form['query'] = 'summer' 25 res = form.submit() 29 res = form.submit() 26 # printres30 assert 'There were 2 results' in res 27 assert 'Search Results' in res 31 assert 'Search Results' in res 32 assert 'Sonnet 18' in res 28 assert 'Shall I compare thee' in res 33 assert 'Shall I compare thee' in res 29 34 trunk/shakespeare/tests/test_search.py
Revision 191 Revision 192 1 import os 1 import os 2 import shutil 2 import shutil 3 import tempfile 3 import tempfile 4 import StringIO 4 import StringIO 5 5 6 import shakespeare.search 6 import shakespeare.search 7 import shakespeare.tests 7 import shakespeare.tests 8 8 9 class TestSearch: 9 class TestSearch: 10 def setUp(self): 10 def setUp(self): 11 self.text = shakespeare.tests.make_fixture() 11 self.text = shakespeare.tests.make_fixture() 12 basetmp = tempfile.gettempdir() 12 basetmp = tempfile.gettempdir() 13 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 13 self.tmpdir = os.path.join(basetmp, 'openshkspr-search') 14 # we leave directory in existence to help with debugging 14 # we leave directory in existence to help with debugging 15 if os.path.exists(self.tmpdir): 15 if os.path.exists(self.tmpdir): 16 shutil.rmtree(self.tmpdir) 16 shutil.rmtree(self.tmpdir) 17 os.makedirs(self.tmpdir) 17 os.makedirs(self.tmpdir) 18 self.index = shakespeare.search.SearchIndex(self.tmpdir) 18 self.index = shakespeare.search.SearchIndex(self.tmpdir) 19 self.index.add_item(StringIO.StringIO(self.text.content), 19 self.index.add_item(StringIO.StringIO(self.text.content), 20 self.text.name) 20 self.text.name) 21 21 22 # TODO: remove the document from the index 22 # TODO: remove the document from the index 23 23 24 def test_add_item(self): 24 def test_add_item(self): 25 # as 4 paras should be 4 but not certain 25 assert self.index.get_database().get_doccount() > 0 26 assert self.index.get_database().get_doccount() > 0 26 27 27 def test_search(self): 28 def test_remove_item(self): 29 self.index.remove_item(self.text.name) 30 assert self.index.get_database().get_doccount() == 0 31 32 def test_search_1(self): 28 out = self.index.search('summer') 33 out = self.index.search('summer') 29 assert len(out) == 2 34 assert len(out) == 2 35 mset1 = out[0] 36 exp = "Shall I compare thee to a summer's day" 37 assert mset1.document.get_data().startswith(exp) 38 39 def 
test_search_2(self): 40 out = self.index.search('summer') 30 mset1 = out[1] 41 mset1 = out[1] 31 # 'But thy eternal summer ... 42 # 'But thy eternal summer ... 32 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 43 exp = "But thy eternal summer shall not fade,\nNor lose possession of that fair thou ow'st," 33 assert mset1.document.get_data().startswith(exp) 44 assert mset1.document.get_data().startswith(exp) 45 46 def test_search_3(self): 34 out = self.index.search('rough') 47 out = self.index.search('rough') 35 assert len(out) == 1 48 assert len(out) == 1 36 49 37 def test_retrieve_lineno(self): 50 def test_retrieve_lineno(self): 38 out = self.index.search('summer') 51 out = self.index.search('summer') 39 mset1 = out[1] 52 mset1 = out[1] 40 lineno = mset1.document.get_value(shakespeare.search.LINE_NO) 53 lineno = mset1.document.get_value(shakespeare.search.LINE_NO) 41 assert lineno == '9' 54 assert lineno == '9' 42 55 43 def test_retrieve_itemid(self): 56 def test_retrieve_itemid(self): 44 out = self.index.search('summer') 57 out = self.index.search('summer') 45 mset1 = out[1] 58 mset1 = out[1] 46 name = mset1.document.get_value(shakespeare.search.ITEM_ID) 59 name = mset1.document.get_value(shakespeare.search.ITEM_ID) 47 assert name == self.text.name 60 assert name == self.text.name 48 61
