comparison pylons_app/lib/indexers/__init__.py @ 479:149940ba96d9 celery

fixed search chunking bug and optimized chunk size
author Marcin Kuzminski <marcin@python-works.com>
date Thu, 16 Sep 2010 15:22:10 +0200
parents 7010af6efde5
children fb0c3af6031b
comparison of 478:7010af6efde5 and 479:149940ba96d9
@@ -13,12 +13,10 @@
 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
 
 import os
 import sys
 import traceback
-
-
 
 #to get the pylons_app import
 sys.path.append(dn(dn(dn(os.path.realpath(__file__)))))
 
 
@@ -48,27 +46,25 @@
 
 
 IDX_NAME = 'HG_INDEX'
 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
 FRAGMENTER = SimpleFragmenter(200)
-
-
-
 
 class ResultWrapper(object):
     def __init__(self, searcher, matcher, highlight_items):
         self.searcher = searcher
         self.matcher = matcher
         self.highlight_items = highlight_items
-        self.fragment_size = 150 * 2
+        self.fragment_size = 200 / 2
 
     @LazyProperty
     def doc_ids(self):
         docs_id = []
         while self.matcher.is_active():
             docnum = self.matcher.id()
-            docs_id.append(docnum)
+            chunks = [offsets for offsets in self.get_chunks()]
+            docs_id.append([docnum, chunks])
             self.matcher.next()
         return docs_id
 
     def __str__(self):
         return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids))
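After this hunk each doc_ids entry pairs the Whoosh document number with the chunk offsets pre-computed for it, and fragment_size drops from 150 * 2 = 300 to 200 / 2 = 100 characters, presumably to line up with the SimpleFragmenter(200) fragment size. A minimal sketch of the new entry shape and how the rest of the class consumes it (the document number and offsets are invented for illustration):

    # hypothetical values, for illustration only
    docnum = 42
    chunks = [(0, 225), (225, 430)]    # (start, end) offsets into res['content']
    entry = [docnum, chunks]           # what doc_ids now stores per matched document

    # get_full_content() unpacks the pair:
    #   res = self.searcher.stored_fields(entry[0])            # entry[0] == docnum
    #   content_short = self.get_short_content(res, entry[1])  # entry[1] == chunks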
@@ -97,22 +93,26 @@
             slice.append(self.get_full_content(docid))
         return slice
 
 
     def get_full_content(self, docid):
-        res = self.searcher.stored_fields(docid)
+        res = self.searcher.stored_fields(docid[0])
         f_path = res['path'][res['path'].find(res['repository']) \
             + len(res['repository']):].lstrip('/')
 
-        content_short = ''.join(self.get_short_content(res))
+        content_short = self.get_short_content(res, docid[1])
         res.update({'content_short':content_short,
                     'content_short_hl':self.highlight(content_short),
                     'f_path':f_path})
 
         return res
 
-    def get_short_content(self, res):
+    def get_short_content(self, res, chunks):
+
+        return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks])
+
+    def get_chunks(self):
         """
         Smart function that implements chunking the content
         but not overlap chunks so it doesn't highlight the same
         close occurences twice.
         @param matcher:
@@ -122,15 +122,15 @@
         for span in self.matcher.spans():
             start = span.startchar or 0
             end = span.endchar or 0
             start_offseted = max(0, start - self.fragment_size)
             end_offseted = end + self.fragment_size
-            print start_offseted, end_offseted
+
             if start_offseted < memory[-1][1]:
                 start_offseted = memory[-1][1]
             memory.append((start_offseted, end_offseted,))
-            yield res["content"][start_offseted:end_offseted]
+            yield (start_offseted, end_offseted,)
 
     def highlight(self, content, top=5):
         hl = highlight(escape(content),
                        self.highlight_items,
                        analyzer=ANALYZER,
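The core of the fix is that get_chunks() now yields non-overlapping (start, end) offset pairs instead of text slices, clamping each chunk's start to the end of the previous one so close occurrences are not highlighted twice. A standalone sketch of that merge logic, assuming memory starts as [(0, 0)] (its initialization is outside the lines shown above) and using invented span positions:

    FRAGMENT_SIZE = 100  # mirrors self.fragment_size = 200 / 2

    def chunk_offsets(span_positions, fragment_size=FRAGMENT_SIZE):
        """Yield non-overlapping (start, end) offsets around each match."""
        memory = [(0, 0)]  # assumption: sentinel so the first comparison works
        for start, end in span_positions:
            start_offseted = max(0, start - fragment_size)
            end_offseted = end + fragment_size
            if start_offseted < memory[-1][1]:
                # would overlap the previous chunk: start where it ended
                start_offseted = memory[-1][1]
            memory.append((start_offseted, end_offseted))
            yield (start_offseted, end_offseted)

    # two hits only 50 characters apart share a boundary instead of overlapping
    print(list(chunk_offsets([(120, 125), (170, 175)])))
    # -> [(20, 225), (225, 275)]

Joining res['content'][start:end] over those pairs is exactly what the new get_short_content() does.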