diff rhodecode/lib/indexers/__init__.py @ 2031:82a88013a3fd

merge 1.3 into stable
author Marcin Kuzminski <marcin@python-works.com>
date Sun, 26 Feb 2012 17:25:09 +0200
parents 752b0a7b7679 b6c902d88472
children dc2584ba5fbc
line wrap: on
line diff
--- a/rhodecode/lib/indexers/__init__.py	Sun Feb 19 20:21:14 2012 +0200
+++ b/rhodecode/lib/indexers/__init__.py	Sun Feb 26 17:25:09 2012 +0200
@@ -7,7 +7,7 @@
 
     :created_on: Aug 17, 2010
     :author: marcink
-    :copyright: (C) 2009-2010 Marcin Kuzminski <marcin@python-works.com>
+    :copyright: (C) 2010-2012 Marcin Kuzminski <marcin@python-works.com>
     :license: GPLv3, see COPYING for more details.
 """
 # This program is free software: you can redistribute it and/or modify
@@ -37,38 +37,39 @@
 from whoosh.fields import TEXT, ID, STORED, Schema, FieldType
 from whoosh.index import create_in, open_dir
 from whoosh.formats import Characters
-from whoosh.highlight import highlight, SimpleFragmenter, HtmlFormatter
+from whoosh.highlight import highlight, HtmlFormatter, ContextFragmenter
 
 from webhelpers.html.builder import escape
 from sqlalchemy import engine_from_config
-from vcs.utils.lazy import LazyProperty
 
 from rhodecode.model import init_model
 from rhodecode.model.scm import ScmModel
 from rhodecode.model.repo import RepoModel
 from rhodecode.config.environment import load_environment
-from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP
+from rhodecode.lib import LANGUAGES_EXTENSIONS_MAP, LazyProperty
 from rhodecode.lib.utils import BasePasterCommand, Command, add_cache
 
-#EXTENSIONS WE WANT TO INDEX CONTENT OFF
+# EXTENSIONS WE WANT TO INDEX CONTENT OF
 INDEX_EXTENSIONS = LANGUAGES_EXTENSIONS_MAP.keys()
 
-#CUSTOM ANALYZER wordsplit + lowercase filter
+# CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 
 
 #INDEX SCHEMA DEFINITION
-SCHEMA = Schema(owner=TEXT(),
-                repository=TEXT(stored=True),
-                path=TEXT(stored=True),
-                content=FieldType(format=Characters(ANALYZER),
-                             scorable=True, stored=True),
-                modtime=STORED(), extension=TEXT(stored=True))
-
+SCHEMA = Schema(
+    owner=TEXT(),
+    repository=TEXT(stored=True),
+    path=TEXT(stored=True),
+    content=FieldType(format=Characters(), analyzer=ANALYZER,
+                      scorable=True, stored=True),
+    modtime=STORED(),
+    extension=TEXT(stored=True)
+)
 
 IDX_NAME = 'HG_INDEX'
 FORMATTER = HtmlFormatter('span', between='\n<span class="break">...</span>\n')
-FRAGMENTER = SimpleFragmenter(200)
+FRAGMENTER = ContextFragmenter(200)
 
 
 class MakeIndex(BasePasterCommand):
@@ -129,13 +130,14 @@
                                 " destroy old and build from scratch",
                           default=False)
 
+
 class ResultWrapper(object):
     def __init__(self, search_type, searcher, matcher, highlight_items):
         self.search_type = search_type
         self.searcher = searcher
         self.matcher = matcher
         self.highlight_items = highlight_items
-        self.fragment_size = 200 / 2
+        self.fragment_size = 200
 
     @LazyProperty
     def doc_ids(self):
@@ -171,11 +173,10 @@
         """
         i, j = key.start, key.stop
 
-        slice = []
+        slices = []
         for docid in self.doc_ids[i:j]:
-            slice.append(self.get_full_content(docid))
-        return slice
-
+            slices.append(self.get_full_content(docid))
+        return slices
 
     def get_full_content(self, docid):
         res = self.searcher.stored_fields(docid[0])
@@ -183,9 +184,9 @@
                              + len(res['repository']):].lstrip('/')
 
         content_short = self.get_short_content(res, docid[1])
-        res.update({'content_short':content_short,
-                    'content_short_hl':self.highlight(content_short),
-                    'f_path':f_path})
+        res.update({'content_short': content_short,
+                    'content_short_hl': self.highlight(content_short),
+                    'f_path': f_path})
 
         return res
 
@@ -198,7 +199,7 @@
         Smart function that implements chunking the content
         but not overlap chunks so it doesn't highlight the same
         close occurrences twice.
-        
+
         :param matcher:
         :param size:
         """
@@ -217,10 +218,12 @@
     def highlight(self, content, top=5):
         if self.search_type != 'content':
             return ''
-        hl = highlight(escape(content),
-                 self.highlight_items,
-                 analyzer=ANALYZER,
-                 fragmenter=FRAGMENTER,
-                 formatter=FORMATTER,
-                 top=top)
+        hl = highlight(
+            text=escape(content),
+            terms=self.highlight_items,
+            analyzer=ANALYZER,
+            fragmenter=FRAGMENTER,
+            formatter=FORMATTER,
+            top=top
+        )
         return hl