# HG changeset patch
# User FUJIWARA Katsunori <foozy@lares.dti.ne.jp>
# Date 1485105458 -32400
# Node ID c0b2410d63a584a49c77605dfbf0358d132eafb6
# Parent  168cc92c1b538d9cccf6f89dd08f43add8cc844f
search: prevent username related conditions from removing "stop words"

Before this revision, the username-related conditions below caused
"stop words" to be unintentionally ignored.

  - owner: (for all)
  - author: (for "Commit messages")

As a result, words such as "this", "a", and "you" in username-related
conditions are completely ignored, even if they are valid username
components.

To prevent username related conditions from removing "stop words",
this revision explicitly specifies "analyzer" for username related
fields of SCHEMA and CHGSETS_SCHEMA.

The difference between EMAILADDRANALYZER and the default analyzer of
TEXT is whether "stop words" are preserved or not. Tokenization is
still applied to usernames.

To allow the two to diverge in the future, this revision doesn't make
EMAILADDRANALYZER share its analyzer definition with PATHANALYZER, even
though their definitions are identical as of this revision.

This revision requires a full rebuild of the index tables, because the
indexing schemas have changed.

Original patch has been modified by Mads Kiilerich - tests of 'owner' will be
addressed separately.

diff -r 168cc92c1b53 -r c0b2410d63a5 kallithea/lib/indexers/__init__.py
--- a/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/lib/indexers/__init__.py	Mon Jan 23 02:17:38 2017 +0900
@@ -44,6 +44,14 @@
 # CUSTOM ANALYZER wordsplit + lowercase filter
 ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter()
 
+# CUSTOM ANALYZER wordsplit + lowercase filter, for emailaddr-like text
+#
+# This is useful to:
+# - avoid removing "stop words" from text
+# - search case-insensitively
+#
+EMAILADDRANALYZER =  RegexTokenizer() | LowercaseFilter()
+
 # CUSTOM ANALYZER raw-string + lowercase filter
 #
 # This is useful to:
@@ -72,7 +80,7 @@
 #INDEX SCHEMA DEFINITION
 SCHEMA = Schema(
     fileid=ID(unique=True),
-    owner=TEXT(),
+    owner=TEXT(analyzer=EMAILADDRANALYZER),
     # this field preserves case of repository name for exact matching
     repository_rawname=TEXT(analyzer=IDANALYZER),
     repository=TEXT(stored=True, analyzer=ICASEIDANALYZER),
@@ -91,12 +99,12 @@
     raw_id=ID(unique=True, stored=True),
     date=NUMERIC(stored=True),
     last=BOOLEAN(),
-    owner=TEXT(),
+    owner=TEXT(analyzer=EMAILADDRANALYZER),
     # this field preserves case of repository name for exact matching
     # and unique-ness in index table
     repository_rawname=ID(unique=True),
     repository=ID(stored=True, analyzer=ICASEIDANALYZER),
-    author=TEXT(stored=True),
+    author=TEXT(stored=True, analyzer=EMAILADDRANALYZER),
     message=FieldType(format=Characters(), analyzer=ANALYZER,
                       scorable=True, stored=True),
     parents=TEXT(),
diff -r 168cc92c1b53 -r c0b2410d63a5 kallithea/tests/functional/test_search_indexing.py
--- a/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
+++ b/kallithea/tests/functional/test_search_indexing.py	Mon Jan 23 02:17:38 2017 +0900
@@ -188,8 +188,8 @@
         ('commit', 'owner:"this-is-it"', 0),
 
         # matching against only 1 revision
-        ('commit', 'author:"this is it"', 0),
-        ('commit', 'author:"this-is-it"', 0),
+        ('commit', 'author:"this is it"', 1),
+        ('commit', 'author:"this-is-it"', 1),
     ])
     def test_mailaddr_stopword(self, searchtype, query, hit):
         response = self.app.get(url(controller='search', action='index'),