changeset 6478:c0b2410d63a5

search: prevent username related conditions from removing "stop words" Before this revision, the username related conditions below unintentionally ignore "stop words". - owner: (for all) - author: (for "Commit messages") Therefore, username related conditions with "this", "a", "you", and so on are completely ignored, even if they are valid username components. To prevent username related conditions from removing "stop words", this revision explicitly specifies "analyzer" for the username related fields of SCHEMA and CHGSETS_SCHEMA. The difference between EMAILADDRANALYZER and the default analyzer of TEXT is whether "stop words" are preserved or not. Tokenization is still applied to usernames. To allow for future changes, this revision doesn't make EMAILADDRANALYZER share its analyzer definition with PATHANALYZER, even though their definitions are identical at this revision. This revision requires a full rebuild of the index tables, because the indexing schemas are changed. The original patch has been modified by Mads Kiilerich - tests of 'owner' will be addressed separately.
author FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
date Mon, 23 Jan 2017 02:17:38 +0900
parents 168cc92c1b53
children 925d21b872e7
files kallithea/lib/indexers/__init__.py kallithea/tests/functional/test_search_indexing.py
diffstat 2 files changed, 13 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
@@ -44,6 +44,14 @@
 # CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 
+# CUSTOM ANALYZER wordsplit + lowercase filter, for emailaddr-like text
+#
+# This is useful to:
+# - avoid removing "stop words" from text
+# - search case-insensitively
+#
+EMAILADDRANALYZER =  RegexTokenizer() | LowercaseFilter()
+
 # CUSTOM ANALYZER raw-string + lowercase filter
 #
 # This is useful to:
@@ -72,7 +80,7 @@
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
-    owner=TEXT(),
+    owner=TEXT(analyzer=EMAILADDRANALYZER),
     # this field preserves case of repository name for exact matching
     repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
@@ -91,12 +99,12 @@
     raw_id=ID(unique=True, stored=True),
     date=NUMERIC(stored=True),
     last=BOOLEAN(),
-    owner=TEXT(),
+    owner=TEXT(analyzer=EMAILADDRANALYZER),
     # this field preserves case of repository name for exact matching
     # and unique-ness in index table
     repository_rawname=ID(unique=True),
     repository=ID(stored=True, analyzer=ICASEIDANALYZER),
-    author=TEXT(stored=True),
+    author=TEXT(stored=True, analyzer=EMAILADDRANALYZER),
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
--- a/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
@@ -188,8 +188,8 @@
         ('commit', 'owner:"this-is-it"', 0),
 
         # matching against only 1 revision
-        ('commit', 'author:"this is it"', 0),
-        ('commit', 'author:"this-is-it"', 0),
+        ('commit', 'author:"this is it"', 1),
+        ('commit', 'author:"this-is-it"', 1),
     ])
     def test_mailaddr_stopword(self, searchtype, query, hit):
         response = self.app.get(url(controller='search', action='index'),