Mercurial > kallithea
comparison rhodecode/lib/indexers/__init__.py @ 2031:82a88013a3fd
merge 1.3 into stable
author | Marcin Kuzminski <marcin@python-works.com> |
---|---|
date | Sun, 26 Feb 2012 17:25:09 +0200 |
parents | 752b0a7b7679 b6c902d88472 |
children | dc2584ba5fbc |
comparison legend: equal | deleted | inserted | replaced
2005:ab0e122b38a7 | 2031:82a88013a3fd |
---|---|
5 | 5 |
6 Whoosh indexing module for RhodeCode | 6 Whoosh indexing module for RhodeCode |
7 | 7 |
8 :created_on: Aug 17, 2010 | 8 :created_on: Aug 17, 2010 |
9 :author: marcink | 9 :author: marcink |
10 :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com> | 10 :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com> |
11 :license: GPLv3, see COPYING for more details. | 11 :license: GPLv3, see COPYING for more details. |
12 """ | 12 """ |
13 # This program is free software: you can redistribute it and/or modify | 13 # This program is free software: you can redistribute it and/or modify |
14 # it under the terms of the GNU General Public License as published by | 14 # it under the terms of the GNU General Public License as published by |
15 # the Free Software Foundation, either version 3 of the License, or | 15 # the Free Software Foundation, either version 3 of the License, or |
35 | 35 |
36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter | 36 from whoosh.analysis import RegexTokenizer, LowercaseFilter, StopFilter |
37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType | 37 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType |
38 from whoosh.index import create_in, open_dir | 38 from whoosh.index import create_in, open_dir |
39 from whoosh.formats import Characters | 39 from whoosh.formats import Characters |
40 from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter | 40 from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter |
41 | 41 |
42 from webhelpers.html.builder import escape | 42 from webhelpers.html.builder import escape |
43 from sqlalchemy import engine_from_config | 43 from sqlalchemy import engine_from_config |
44 from vcs.utils.lazy import LazyProperty | |
45 | 44 |
46 from rhodecode.model import init_model | 45 from rhodecode.model import init_model |
47 from rhodecode.model.scm import ScmModel | 46 from rhodecode.model.scm import ScmModel |
48 from rhodecode.model.repo import RepoModel | 47 from rhodecode.model.repo import RepoModel |
49 from rhodecode.config.environment import load_environment | 48 from rhodecode.config.environment import load_environment |
50 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP | 49 from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP, LazyProperty |
51 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache | 50 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache |
52 | 51 |
53 #EXTENSIONS WE WANT TO INDEX CONTENT OFF | 52 # EXTENSIONS WE WANT TO INDEX CONTENT OFF |
54 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys() | 53 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys() |
55 | 54 |
56 #CUSTOM ANALYZER wordsplit + lowercase filter | 55 # CUSTOM ANALYZER wordsplit + lowercase filter |
57 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() | 56 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() |
58 | 57 |
59 | 58 |
60 #INDEX SCHEMA DEFINITION | 59 #INDEX SCHEMA DEFINITION |
61 SCHEMA = Schema(owner=TEXT(), | 60 SCHEMA = Schema( |
62 repository=TEXT(stored=True), | 61 owner=TEXT(), |
63 path=TEXT(stored=True), | 62 repository=TEXT(stored=True), |
64 content=FieldType(format=Characters(ANALYZER), | 63 path=TEXT(stored=True), |
65 scorable=True, stored=True), | 64 content=FieldType(format=Characters(), analyzer=ANALYZER, |
66 modtime=STORED(), extension=TEXT(stored=True)) | 65 scorable=True, stored=True), |
67 | 66 modtime=STORED(), |
67 extension=TEXT(stored=True) | |
68 ) | |
68 | 69 |
69 IDX_NAME = 'HG_INDEX' | 70 IDX_NAME = 'HG_INDEX' |
70 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') | 71 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n') |
71 FRAGMENTER = SimpleFragmenter(200) | 72 FRAGMENTER = ContextFragmenter(200) |
72 | 73 |
73 | 74 |
74 class MakeIndex(BasePasterCommand): | 75 class MakeIndex(BasePasterCommand): |
75 | 76 |
76 max_args = 1 | 77 max_args = 1 |
127 dest='full_index', | 128 dest='full_index', |
128 help="Specifies that index should be made full i.e" | 129 help="Specifies that index should be made full i.e" |
129 " destroy old and build from scratch", | 130 " destroy old and build from scratch", |
130 default=False) | 131 default=False) |
131 | 132 |
133 | |
132 class ResultWrapper(object): | 134 class ResultWrapper(object): |
133 def __init__(self, search_type, searcher, matcher, highlight_items): | 135 def __init__(self, search_type, searcher, matcher, highlight_items): |
134 self.search_type = search_type | 136 self.search_type = search_type |
135 self.searcher = searcher | 137 self.searcher = searcher |
136 self.matcher = matcher | 138 self.matcher = matcher |
137 self.highlight_items = highlight_items | 139 self.highlight_items = highlight_items |
138 self.fragment_size = 200 / 2 | 140 self.fragment_size = 200 |
139 | 141 |
140 @LazyProperty | 142 @LazyProperty |
141 def doc_ids(self): | 143 def doc_ids(self): |
142 docs_id = [] | 144 docs_id = [] |
143 while self.matcher.is_active(): | 145 while self.matcher.is_active(): |
169 """ | 171 """ |
170 Slicing of resultWrapper | 172 Slicing of resultWrapper |
171 """ | 173 """ |
172 i, j = key.start, key.stop | 174 i, j = key.start, key.stop |
173 | 175 |
174 slice = [] | 176 slices = [] |
175 for docid in self.doc_ids[i:j]: | 177 for docid in self.doc_ids[i:j]: |
176 slice.append(self.get_full_content(docid)) | 178 slices.append(self.get_full_content(docid)) |
177 return slice | 179 return slices |
178 | |
179 | 180 |
180 def get_full_content(self, docid): | 181 def get_full_content(self, docid): |
181 res = self.searcher.stored_fields(docid[0]) | 182 res = self.searcher.stored_fields(docid[0]) |
182 f_path = res['path'][res['path'].find(res['repository']) \ | 183 f_path = res['path'][res['path'].find(res['repository']) \ |
183 + len(res['repository']):].lstrip('/') | 184 + len(res['repository']):].lstrip('/') |
184 | 185 |
185 content_short = self.get_short_content(res, docid[1]) | 186 content_short = self.get_short_content(res, docid[1]) |
186 res.update({'content_short':content_short, | 187 res.update({'content_short': content_short, |
187 'content_short_hl':self.highlight(content_short), | 188 'content_short_hl': self.highlight(content_short), |
188 'f_path':f_path}) | 189 'f_path': f_path}) |
189 | 190 |
190 return res | 191 return res |
191 | 192 |
192 def get_short_content(self, res, chunks): | 193 def get_short_content(self, res, chunks): |
193 | 194 |
196 def get_chunks(self): | 197 def get_chunks(self): |
197 """ | 198 """ |
198 Smart function that implements chunking the content | 199 Smart function that implements chunking the content |
199 but not overlap chunks so it doesn't highlight the same | 200 but not overlap chunks so it doesn't highlight the same |
200 close occurrences twice. | 201 close occurrences twice. |
201 | 202 |
202 :param matcher: | 203 :param matcher: |
203 :param size: | 204 :param size: |
204 """ | 205 """ |
205 memory = [(0, 0)] | 206 memory = [(0, 0)] |
206 for span in self.matcher.spans(): | 207 for span in self.matcher.spans(): |
215 yield (start_offseted, end_offseted,) | 216 yield (start_offseted, end_offseted,) |
216 | 217 |
217 def highlight(self, content, top=5): | 218 def highlight(self, content, top=5): |
218 if self.search_type != 'content': | 219 if self.search_type != 'content': |
219 return '' | 220 return '' |
220 hl = highlight(escape(content), | 221 hl = highlight( |
221 self.highlight_items, | 222 text=escape(content), |
222 analyzer=ANALYZER, | 223 terms=self.highlight_items, |
223 fragmenter=FRAGMENTER, | 224 analyzer=ANALYZER, |
224 formatter=FORMATTER, | 225 fragmenter=FRAGMENTER, |
225 top=top) | 226 formatter=FORMATTER, |
227 top=top | |
228 ) | |
226 return hl | 229 return hl |