changeset 6477:168cc92c1b53

search: prevent pathname related conditions from removing "stop words"

Before this revision, the pathname related conditions below caused "stop words" to be unintentionally ignored:

- path:, extension: (for "File contents" or "File names")
- added:, removed:, changed: (for "Commit messages")

As a result, pathname related conditions containing "this", "a", "you", and so on were completely ignored, even when those words are valid pathname components.

To prevent pathname related conditions from removing "stop words", this revision explicitly specifies an "analyzer" for the pathname related fields of SCHEMA and CHGSETS_SCHEMA.

The only difference between PATHANALYZER and the default analyzer of TEXT is whether "stop words" are preserved or not. Tokenization is still applied to pathnames.

This revision requires a full rebuild of the index tables, because the indexing schemas are changed.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Mon, 23 Jan 2017 02:17:38 +0900
parents 8b7c0ef62427
children c0b2410d63a5
files kallithea/lib/indexers/__init__.py kallithea/tests/functional/test_search_indexing.py
diffstat 2 files changed, 19 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
@@ -61,6 +61,14 @@
 #
 IDANALYZER = IDTokenizer()
 
+# CUSTOM ANALYZER wordsplit + lowercase filter, for pathname-like text
+#
+# This is useful to:
+# - avoid removing "stop words" from text
+# - search case-insensitively
+#
+PATHANALYZER = RegexTokenizer() | LowercaseFilter()
+
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
@@ -68,11 +76,11 @@
     # this field preserves case of repository name for exact matching
     repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
-    path=TEXT(stored=True),
+    path=TEXT(stored=True, analyzer=PATHANALYZER),
     content=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     modtime=STORED(),
-    extension=TEXT(stored=True)
+    extension=TEXT(stored=True, analyzer=PATHANALYZER)
 )
 
 IDX_NAME = 'HG_INDEX'
@@ -92,9 +100,9 @@
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
-    added=TEXT(),
-    removed=TEXT(),
-    changed=TEXT(),
+    added=TEXT(analyzer=PATHANALYZER),
+    removed=TEXT(analyzer=PATHANALYZER),
+    changed=TEXT(analyzer=PATHANALYZER),
 )
 
 CHGSET_IDX_NAME = 'CHGSET_INDEX'
--- a/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
@@ -156,19 +156,19 @@
 
         # confirm that there is no matching against lower name repository
         assert uname in response
-        #assert lname not in response
+        assert lname not in response
 
     @parametrize('searchtype,query,hit', [
-        ('content', 'path:this/is/it def test', 37),
-        ('commit', 'added:this/is/it bother to ask where', 4),
+        ('content', 'path:this/is/it def test', 1),
+        ('commit', 'added:this/is/it bother to ask where', 1),
         # this condition matches against files below, because
         # "path:" condition is also applied on "repository path".
         # - "this/is/it" in "stopword_test" repo
         # - "this_should_be_unique_filename.txt" in "this-is-it" repo
-        ('path', 'this/is/it', 0),
+        ('path', 'this/is/it', 2),
 
-        ('content', 'extension:us', 0),
-        ('path', 'extension:us', 0),
+        ('content', 'extension:us', 1),
+        ('path', 'extension:us', 1),
     ])
     def test_filename_stopword(self, searchtype, query, hit):
         response = self.app.get(url(controller='search', action='index'),