comparison rhodecode/lib/indexers/__init__.py @ 2031:82a88013a3fd

merge 1.3 into stable
author Marcin Kuzminski <marcin@python-works.com>
date Sun, 26 Feb 2012 17:25:09 +0200
parents 752b0a7b7679 b6c902d88472
children dc2584ba5fbc
comparison
equal deleted inserted replaced
2005:ab0e122b38a7 2031:82a88013a3fd
5 5
6 Whoosh indexing module for RhodeCode 6 Whoosh indexing module for RhodeCode
7 7
8 :created_on: Aug 17, 2010 8 :created_on: Aug 17, 2010
9 :author: marcink 9 :author: marcink
10 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com> 10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
11 :license: GPLv3, see COPYING for more details. 11 :license: GPLv3, see COPYING for more details.
12 """ 12 """
13 # This program is free software: you can redistribute it and/or modify 13 # This program is free software: you can redistribute it and/or modify
14 # it under the terms of the GNU General Public License as published by 14 # it under the terms of the GNU General Public License as published by
15 # the Free Software Foundation, either version 3 of the License, or 15 # the Free Software Foundation, either version 3 of the License, or
35 35
36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter 36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter
37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType 37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
38 from whoosh.index import create_in, open_dir 38 from whoosh.index import create_in, open_dir
39 from whoosh.formats import Characters 39 from whoosh.formats import Characters
40 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter 40 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
41 41
42 from webhelpers.html.builder import escape 42 from webhelpers.html.builder import escape
43 from sqlalchemy import engine_from_config 43 from sqlalchemy import engine_from_config
44 from vcs.utils.lazy import LazyProperty
45 44
46 from rhodecode.model import init_model 45 from rhodecode.model import init_model
47 from rhodecode.model.scm import ScmModel 46 from rhodecode.model.scm import ScmModel
48 from rhodecode.model.repo import RepoModel 47 from rhodecode.model.repo import RepoModel
49 from rhodecode.config.environment import load_environment 48 from rhodecode.config.environment import load_environment
50 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP 49 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP, LazyProperty
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache 50 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
52 51
53 #EXTENSIONS WE WANT TO INDEX CONTENT OFF 52 # EXTENSIONS WE WANT TO INDEX CONTENT OFF
54 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys() 53 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
55 54
56 #CUSTOM ANALYZER wordsplit + lowercase filter 55 # CUSTOM ANALYZER wordsplit + lowercase filter
57 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() 56 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
58 57
59 58
60 #INDEX SCHEMA DEFINITION 59 #INDEX SCHEMA DEFINITION
61 SCHEMA = Schema(owner=TEXT(), 60 SCHEMA = Schema(
62 repository=TEXT(stored=True), 61 owner=TEXT(),
63 path=TEXT(stored=True), 62 repository=TEXT(stored=True),
64 content=FieldType(format=Characters(ANALYZER), 63 path=TEXT(stored=True),
65 scorable=True, stored=True), 64 content=FieldType(format=Characters(), analyzer=ANALYZER,
66 modtime=STORED(), extension=TEXT(stored=True)) 65 scorable=True, stored=True),
67 66 modtime=STORED(),
67 extension=TEXT(stored=True)
68 )
68 69
69 IDX_NAME = 'HG_INDEX' 70 IDX_NAME = 'HG_INDEX'
70 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') 71 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
71 FRAGMENTER = SimpleFragmenter(200) 72 FRAGMENTER = ContextFragmenter(200)
72 73
73 74
74 class MakeIndex(BasePasterCommand): 75 class MakeIndex(BasePasterCommand):
75 76
76 max_args = 1 77 max_args = 1
127 dest='full_index', 128 dest='full_index',
128 help="Specifies that index should be made full i.e" 129 help="Specifies that index should be made full i.e"
129 " destroy old and build from scratch", 130 " destroy old and build from scratch",
130 default=False) 131 default=False)
131 132
133
132 class ResultWrapper(object): 134 class ResultWrapper(object):
133 def __init__(self, search_type, searcher, matcher, highlight_items): 135 def __init__(self, search_type, searcher, matcher, highlight_items):
134 self.search_type = search_type 136 self.search_type = search_type
135 self.searcher = searcher 137 self.searcher = searcher
136 self.matcher = matcher 138 self.matcher = matcher
137 self.highlight_items = highlight_items 139 self.highlight_items = highlight_items
138 self.fragment_size = 200 / 2 140 self.fragment_size = 200
139 141
140 @LazyProperty 142 @LazyProperty
141 def doc_ids(self): 143 def doc_ids(self):
142 docs_id = [] 144 docs_id = []
143 while self.matcher.is_active(): 145 while self.matcher.is_active():
169 """ 171 """
170 Slicing of resultWrapper 172 Slicing of resultWrapper
171 """ 173 """
172 i, j = key.start, key.stop 174 i, j = key.start, key.stop
173 175
174 slice = [] 176 slices = []
175 for docid in self.doc_ids[i:j]: 177 for docid in self.doc_ids[i:j]:
176 slice.append(self.get_full_content(docid)) 178 slices.append(self.get_full_content(docid))
177 return slice 179 return slices
178
179 180
180 def get_full_content(self, docid): 181 def get_full_content(self, docid):
181 res = self.searcher.stored_fields(docid[0]) 182 res = self.searcher.stored_fields(docid[0])
182 f_path = res['path'][res['path'].find(res['repository']) \ 183 f_path = res['path'][res['path'].find(res['repository']) \
183 + len(res['repository']):].lstrip('/') 184 + len(res['repository']):].lstrip('/')
184 185
185 content_short = self.get_short_content(res, docid[1]) 186 content_short = self.get_short_content(res, docid[1])
186 res.update({'content_short':content_short, 187 res.update({'content_short': content_short,
187 'content_short_hl':self.highlight(content_short), 188 'content_short_hl': self.highlight(content_short),
188 'f_path':f_path}) 189 'f_path': f_path})
189 190
190 return res 191 return res
191 192
192 def get_short_content(self, res, chunks): 193 def get_short_content(self, res, chunks):
193 194
196 def get_chunks(self): 197 def get_chunks(self):
197 """ 198 """
198 Smart function that implements chunking the content 199 Smart function that implements chunking the content
199 but not overlap chunks so it doesn't highlight the same 200 but not overlap chunks so it doesn't highlight the same
200 close occurrences twice. 201 close occurrences twice.
201 202
202 :param matcher: 203 :param matcher:
203 :param size: 204 :param size:
204 """ 205 """
205 memory = [(0, 0)] 206 memory = [(0, 0)]
206 for span in self.matcher.spans(): 207 for span in self.matcher.spans():
215 yield (start_offseted, end_offseted,) 216 yield (start_offseted, end_offseted,)
216 217
217 def highlight(self, content, top=5): 218 def highlight(self, content, top=5):
218 if self.search_type != 'content': 219 if self.search_type != 'content':
219 return '' 220 return ''
220 hl = highlight(escape(content), 221 hl = highlight(
221 self.highlight_items, 222 text=escape(content),
222 analyzer=ANALYZER, 223 terms=self.highlight_items,
223 fragmenter=FRAGMENTER, 224 analyzer=ANALYZER,
224 formatter=FORMATTER, 225 fragmenter=FRAGMENTER,
225 top=top) 226 formatter=FORMATTER,
227 top=top
228 )
226 return hl 229 return hl