comparison rhodecode/lib/indexers/__init__.py @ 2640:5f21a9dcb09d beta

Create an index for commit messages, and add the ability to search them and view the results
author Indra Talip <indra.talip@gmail.com>
date Fri, 20 Jul 2012 12:50:56 +0200
parents 324b838250c9
children 88b0e82bcba4
comparison
equal deleted inserted replaced
2631:f597cfb492f9 2640:5f21a9dcb09d
33 33
34 from string import strip 34 from string import strip
35 from shutil import rmtree 35 from shutil import rmtree
36 36
37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter 37 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
38 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType 38 from whoosh.fields import TEXT, ID, STORED, NUMERIC, BOOLEAN, Schema, FieldType
39 from whoosh.index import create_in, open_dir 39 from whoosh.index import create_in, open_dir
40 from whoosh.formats import Characters 40 from whoosh.formats import Characters
41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter 41 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
42 42
43 from webhelpers.html.builder import escape, literal 43 from webhelpers.html.builder import escape, literal
49 from rhodecode.config.environment import load_environment 49 from rhodecode.config.environment import load_environment
50 from rhodecode.lib.utils2 import LazyProperty 50 from rhodecode.lib.utils2 import LazyProperty
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\ 51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache,\
52 load_rcextensions 52 load_rcextensions
53 53
54 log = logging.getLogger(__name__)
55
54 # CUSTOM ANALYZER wordsplit + lowercase filter 56 # CUSTOM ANALYZER wordsplit + lowercase filter
55 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() 57 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
56
57 58
58 #INDEX SCHEMA DEFINITION 59 #INDEX SCHEMA DEFINITION
59 SCHEMA = Schema( 60 SCHEMA = Schema(
60 fileid=ID(unique=True), 61 fileid=ID(unique=True),
61 owner=TEXT(), 62 owner=TEXT(),
69 70
70 IDX_NAME = 'HG_INDEX' 71 IDX_NAME = 'HG_INDEX'
71 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') 72 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
72 FRAGMENTER = ContextFragmenter(200) 73 FRAGMENTER = ContextFragmenter(200)
73 74
75 CHGSETS_SCHEMA = Schema(
76 path=ID(unique=True, stored=True),
77 revision=NUMERIC(unique=True, stored=True),
78 last=BOOLEAN(),
79 owner=TEXT(),
80 repository=ID(unique=True, stored=True),
81 author=TEXT(stored=True),
82 message=FieldType(format=Characters(), analyzer=ANALYZER,
83 scorable=True, stored=True),
84 parents=TEXT(),
85 added=TEXT(),
86 removed=TEXT(),
87 changed=TEXT(),
88 )
89
90 CHGSET_IDX_NAME = 'CHGSET_INDEX'
74 91
75 class MakeIndex(BasePasterCommand): 92 class MakeIndex(BasePasterCommand):
76 93
77 max_args = 1 94 max_args = 1
78 min_args = 1 95 min_args = 1
189 slices.append(self.get_full_content(docid)) 206 slices.append(self.get_full_content(docid))
190 return slices 207 return slices
191 208
192 def get_full_content(self, docid): 209 def get_full_content(self, docid):
193 res = self.searcher.stored_fields(docid[0]) 210 res = self.searcher.stored_fields(docid[0])
211 log.debug('result: %s' % res)
194 full_repo_path = jn(self.repo_location, res['repository']) 212 full_repo_path = jn(self.repo_location, res['repository'])
195 f_path = res['path'].split(full_repo_path)[-1] 213 f_path = res['path'].split(full_repo_path)[-1]
196 f_path = f_path.lstrip(os.sep) 214 f_path = f_path.lstrip(os.sep)
197 215 res.update({'f_path': f_path})
198 content_short = self.get_short_content(res, docid[1]) 216
199 res.update({'content_short': content_short, 217 if self.search_type == 'content':
200 'content_short_hl': self.highlight(content_short), 218 content_short = self.get_short_content(res, docid[1])
201 'f_path': f_path}) 219 res.update({'content_short': content_short,
220 'content_short_hl': self.highlight(content_short)})
221 elif self.search_type == 'message':
222 res.update({'message_hl': self.highlight(res['message'])})
223
224 log.debug('result: %s' % res)
202 225
203 return res 226 return res
204 227
205 def get_short_content(self, res, chunks): 228 def get_short_content(self, res, chunks):
206 229
214 237
215 :param matcher: 238 :param matcher:
216 :param size: 239 :param size:
217 """ 240 """
218 memory = [(0, 0)] 241 memory = [(0, 0)]
219 for span in self.matcher.spans(): 242 if self.matcher.supports('positions'):
220 start = span.startchar or 0 243 for span in self.matcher.spans():
221 end = span.endchar or 0 244 start = span.startchar or 0
222 start_offseted = max(0, start - self.fragment_size) 245 end = span.endchar or 0
223 end_offseted = end + self.fragment_size 246 start_offseted = max(0, start - self.fragment_size)
224 247 end_offseted = end + self.fragment_size
225 if start_offseted < memory[-1][1]: 248
226 start_offseted = memory[-1][1] 249 if start_offseted < memory[-1][1]:
227 memory.append((start_offseted, end_offseted,)) 250 start_offseted = memory[-1][1]
228 yield (start_offseted, end_offseted,) 251 memory.append((start_offseted, end_offseted,))
252 yield (start_offseted, end_offseted,)
229 253
230 def highlight(self, content, top=5): 254 def highlight(self, content, top=5):
231 if self.search_type != 'content': 255 if self.search_type not in ['content', 'message']:
232 return '' 256 return ''
233 hl = highlight( 257 hl = highlight(
234 text=content, 258 text=content,
235 terms=self.highlight_items, 259 terms=self.highlight_items,
236 analyzer=ANALYZER, 260 analyzer=ANALYZER,