# HG changeset patch # User FUJIWARA Katsunori # Date 1485105458 -32400 # Node ID c0b2410d63a584a49c77605dfbf0358d132eafb6 # Parent 168cc92c1b538d9cccf6f89dd08f43add8cc844f search: prevent username related conditions from removing "stop words" Before this revision, the username related conditions below unintentionally drop "stop words". - owner: (for all) - author: (for "Commit messages") Therefore, username related conditions containing "this", "a", "you", and so on are completely ignored, even if they are valid username components. To prevent username related conditions from removing "stop words", this revision explicitly specifies "analyzer" for the username related fields of SCHEMA and CHGSETS_SCHEMA. The difference between EMAILADDRANALYZER and the default analyzer of TEXT is whether "stop words" are preserved or not. Tokenization is still applied to usernames. To allow for future changes, this revision doesn't make EMAILADDRANALYZER share its analyzer definition with PATHANALYZER, even though their definitions are identical as of this revision. This revision requires a full rebuild of the index tables, because the indexing schemas are changed. The original patch has been modified by Mads Kiilerich - tests of 'owner' will be addressed separately. 
diff -r 168cc92c1b53 -r c0b2410d63a5 kallithea/lib/indexers/__init__.py --- a/kallithea/lib/indexers/__init__.py Mon Jan 23 02:17:38 2017 +0900 +++ b/kallithea/lib/indexers/__init__.py Mon Jan 23 02:17:38 2017 +0900 @@ -44,6 +44,14 @@ # CUSTOM ANALYZER wordsplit + lowercase filter ANALYZER = RegexTokenizer(expression=r"\w+") | LowercaseFilter() +# CUSTOM ANALYZER wordsplit + lowercase filter, for emailaddr-like text +# +# This is useful to: +# - avoid removing "stop words" from text +# - search case-insensitively +# +EMAILADDRANALYZER = RegexTokenizer() | LowercaseFilter() + # CUSTOM ANALYZER raw-string + lowercase filter # # This is useful to: @@ -72,7 +80,7 @@ #INDEX SCHEMA DEFINITION SCHEMA = Schema( fileid=ID(unique=True), - owner=TEXT(), + owner=TEXT(analyzer=EMAILADDRANALYZER), # this field preserves case of repository name for exact matching repository_rawname=TEXT(analyzer=IDANALYZER), repository=TEXT(stored=True, analyzer=ICASEIDANALYZER), @@ -91,12 +99,12 @@ raw_id=ID(unique=True, stored=True), date=NUMERIC(stored=True), last=BOOLEAN(), - owner=TEXT(), + owner=TEXT(analyzer=EMAILADDRANALYZER), # this field preserves case of repository name for exact matching # and unique-ness in index table repository_rawname=ID(unique=True), repository=ID(stored=True, analyzer=ICASEIDANALYZER), - author=TEXT(stored=True), + author=TEXT(stored=True, analyzer=EMAILADDRANALYZER), message=FieldType(format=Characters(), analyzer=ANALYZER, scorable=True, stored=True), parents=TEXT(), diff -r 168cc92c1b53 -r c0b2410d63a5 kallithea/tests/functional/test_search_indexing.py --- a/kallithea/tests/functional/test_search_indexing.py Mon Jan 23 02:17:38 2017 +0900 +++ b/kallithea/tests/functional/test_search_indexing.py Mon Jan 23 02:17:38 2017 +0900 @@ -188,8 +188,8 @@ ('commit', 'owner:"this-is-it"', 0), # matching against only 1 revision - ('commit', 'author:"this is it"', 0), - ('commit', 'author:"this-is-it"', 0), + ('commit', 'author:"this is it"', 1), + ('commit', 
'author:"this-is-it"', 1), ]) def test_mailaddr_stopword(self, searchtype, query, hit): response = self.app.get(url(controller='search', action='index'),