Mercurial > kallithea
comparison pylons_app/lib/indexers/__init__.py @ 479:149940ba96d9 celery
fixed search chunking bug and optimized chunk size
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Thu, 16 Sep 2010 15:22:10 +0200 |
parents | 7010af6efde5 |
children | fb0c3af6031b |
comparison legend: equal | deleted | inserted | replaced
478:7010af6efde5 | 479:149940ba96d9 |
---|---|
13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter | 13 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter |
14 | 14 |
15 import os | 15 import os |
16 import sys | 16 import sys |
17 import traceback | 17 import traceback |
18 | |
19 | |
20 | 18 |
21 #to get the pylons_app import | 19 #to get the pylons_app import |
22 sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) | 20 sys.path.append(dn(dn(dn(os.path.realpath(__file__))))) |
23 | 21 |
24 | 22 |
48 | 46 |
49 | 47 |
50 IDX_NAME = 'HG_INDEX' | 48 IDX_NAME = 'HG_INDEX' |
51 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') | 49 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') |
52 FRAGMENTER = SimpleFragmenter(200) | 50 FRAGMENTER = SimpleFragmenter(200) |
53 | |
54 | |
55 | |
56 | 51 |
57 class ResultWrapper(object): | 52 class ResultWrapper(object): |
58 def __init__(self, searcher, matcher, highlight_items): | 53 def __init__(self, searcher, matcher, highlight_items): |
59 self.searcher = searcher | 54 self.searcher = searcher |
60 self.matcher = matcher | 55 self.matcher = matcher |
61 self.highlight_items = highlight_items | 56 self.highlight_items = highlight_items |
62 self.fragment_size = 150 * 2 | 57 self.fragment_size = 200 / 2 |
63 | 58 |
64 @LazyProperty | 59 @LazyProperty |
65 def doc_ids(self): | 60 def doc_ids(self): |
66 docs_id = [] | 61 docs_id = [] |
67 while self.matcher.is_active(): | 62 while self.matcher.is_active(): |
68 docnum = self.matcher.id() | 63 docnum = self.matcher.id() |
69 docs_id.append(docnum) | 64 chunks = [offsets for offsets in self.get_chunks()] |
65 docs_id.append([docnum, chunks]) | |
70 self.matcher.next() | 66 self.matcher.next() |
71 return docs_id | 67 return docs_id |
72 | 68 |
73 def __str__(self): | 69 def __str__(self): |
74 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids)) | 70 return '<%s at %s>' % (self.__class__.__name__, len(self.doc_ids)) |
97 slice.append(self.get_full_content(docid)) | 93 slice.append(self.get_full_content(docid)) |
98 return slice | 94 return slice |
99 | 95 |
100 | 96 |
101 def get_full_content(self, docid): | 97 def get_full_content(self, docid): |
102 res = self.searcher.stored_fields(docid) | 98 res = self.searcher.stored_fields(docid[0]) |
103 f_path = res['path'][res['path'].find(res['repository']) \ | 99 f_path = res['path'][res['path'].find(res['repository']) \ |
104 + len(res['repository']):].lstrip('/') | 100 + len(res['repository']):].lstrip('/') |
105 | 101 |
106 content_short = ''.join(self.get_short_content(res)) | 102 content_short = self.get_short_content(res, docid[1]) |
107 res.update({'content_short':content_short, | 103 res.update({'content_short':content_short, |
108 'content_short_hl':self.highlight(content_short), | 104 'content_short_hl':self.highlight(content_short), |
109 'f_path':f_path}) | 105 'f_path':f_path}) |
110 | 106 |
111 return res | 107 return res |
112 | 108 |
113 def get_short_content(self, res): | 109 def get_short_content(self, res, chunks): |
110 | |
111 return ''.join([res['content'][chunk[0]:chunk[1]] for chunk in chunks]) | |
112 | |
113 def get_chunks(self): | |
114 """ | 114 """ |
115 Smart function that implements chunking the content | 115 Smart function that implements chunking the content |
116 but not overlap chunks so it doesn't highlight the same | 116 but not overlap chunks so it doesn't highlight the same |
117 close occurences twice. | 117 close occurences twice. |
118 @param matcher: | 118 @param matcher: |
122 for span in self.matcher.spans(): | 122 for span in self.matcher.spans(): |
123 start = span.startchar or 0 | 123 start = span.startchar or 0 |
124 end = span.endchar or 0 | 124 end = span.endchar or 0 |
125 start_offseted = max(0, start - self.fragment_size) | 125 start_offseted = max(0, start - self.fragment_size) |
126 end_offseted = end + self.fragment_size | 126 end_offseted = end + self.fragment_size |
127 print start_offseted, end_offseted | 127 |
128 if start_offseted < memory[-1][1]: | 128 if start_offseted < memory[-1][1]: |
129 start_offseted = memory[-1][1] | 129 start_offseted = memory[-1][1] |
130 memory.append((start_offseted, end_offseted,)) | 130 memory.append((start_offseted, end_offseted,)) |
131 yield res["content"][start_offseted:end_offseted] | 131 yield (start_offseted, end_offseted,) |
132 | 132 |
133 def highlight(self, content, top=5): | 133 def highlight(self, content, top=5): |
134 hl = highlight(escape(content), | 134 hl = highlight(escape(content), |
135 self.highlight_items, | 135 self.highlight_items, |
136 analyzer=ANALYZER, | 136 analyzer=ANALYZER, |