changeset 2643:2ad50c44b025 beta

when indexing changesets use the raw_id to locate the point from which to start indexing rather than the revision which can be unreliable.
author Indra Talip <indra.talip@gmail.com>
date Sat, 21 Jul 2012 08:20:32 +0200
parents 88b0e82bcba4
children bed929b24f63
files rhodecode/lib/indexers/__init__.py rhodecode/lib/indexers/daemon.py rhodecode/tests/functional/test_search.py
diffstat 3 files changed, 28 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/rhodecode/lib/indexers/__init__.py	Sat Jul 21 06:01:32 2012 +0200
+++ b/rhodecode/lib/indexers/__init__.py	Sat Jul 21 08:20:32 2012 +0200
@@ -74,7 +74,6 @@
 
 CHGSETS_SCHEMA = Schema(
     raw_id=ID(unique=True, stored=True),
-    revision=NUMERIC(unique=True, stored=True),
     last=BOOLEAN(),
     owner=TEXT(),
     repository=ID(unique=True, stored=True),
--- a/rhodecode/lib/indexers/daemon.py	Sat Jul 21 06:01:32 2012 +0200
+++ b/rhodecode/lib/indexers/daemon.py	Sat Jul 21 08:20:32 2012 +0200
@@ -168,23 +168,34 @@
         )
         return indexed, indexed_w_content
 
-    def index_changesets(self, writer, repo_name, repo, start_rev=0):
+    def index_changesets(self, writer, repo_name, repo, start_rev=None):
         """
         Add all changeset in the vcs repo starting at start_rev
         to the index writer
+
+        :param writer: the whoosh index writer to add to
+        :param repo_name: name of the repository from whence the
+          changeset originates including the repository group
+        :param repo: the vcs repository instance to index changesets for,
+          the presumption is the repo has changesets to index
+        :param start_rev=None: the full sha id to start indexing from
+          if start_rev is None then index from the first changeset in
+          the repo
         """
 
-        log.debug('indexing changesets in %s[%d:]' % (repo_name, start_rev))
+        if start_rev is None:
+            start_rev = repo[0].raw_id
+
+        log.debug('indexing changesets in %s starting at rev: %s' % (repo_name, start_rev))
 
         indexed=0
-        for cs in repo[start_rev:]:
+        for cs in repo.get_changesets(start=start_rev):
             writer.add_document(
                 raw_id=unicode(cs.raw_id),
                 owner=unicode(repo.contact),
                 repository=safe_unicode(repo_name),
                 author=cs.author,
                 message=cs.message,
-                revision=cs.revision,
                 last=cs.last,
                 added=u' '.join([node.path for node in cs.added]).lower(),
                 removed=u' '.join([node.path for node in cs.removed]).lower(),
@@ -214,21 +225,27 @@
             try:
                 for repo_name, repo in self.repo_paths.items():
                     # skip indexing if there aren't any revs in the repo
-                    revs = repo.revisions
-                    if len(revs) < 1:
+                    num_of_revs = len(repo)
+                    if num_of_revs < 1:
                         continue
 
                     qp = QueryParser('repository', schema=CHGSETS_SCHEMA)
                     q = qp.parse(u"last:t AND %s" % repo_name)
 
-                    results = searcher.search(q, sortedby='revision')
+                    results = searcher.search(q)
 
+                    # default to scanning the entire repo
                     last_rev = 0
+                    start_id = None
+
                     if len(results) > 0:
-                        last_rev = results[0]['revision']
+                        # assuming that there is only one result, if not this
+                        # may require a full re-index.
+                        start_id = results[0]['raw_id']
+                        last_rev = repo.get_changeset(revision=start_id).revision
 
                     # there are new changesets to index or a new repo to index
-                    if last_rev == 0 or len(revs) > last_rev + 1:
+                    if last_rev == 0 or num_of_revs > last_rev + 1:
                         # delete the docs in the index for the previous last changeset(s)
                         for hit in results:
                             q = qp.parse(u"last:t AND %s AND raw_id:%s" % 
@@ -236,7 +253,7 @@
                             writer.delete_by_query(q)
 
                         # index from the previous last changeset + all new ones
-                        self.index_changesets(writer, repo_name, repo, last_rev)
+                        self.index_changesets(writer, repo_name, repo, start_id)
                         writer_is_dirty = True
 
             finally:
--- a/rhodecode/tests/functional/test_search.py	Sat Jul 21 06:01:32 2012 +0200
+++ b/rhodecode/tests/functional/test_search.py	Sat Jul 21 08:20:32 2012 +0200
@@ -72,7 +72,7 @@
     def test_search_author(self):
         self.log_user()
         response = self.app.get(url(controller='search', action='index'),
-                                {'q': 'author:marcin@python-blog.com revision:0',
+                                {'q': 'author:marcin@python-blog.com raw_id:b986218ba1c9b0d6a259fac9b050b1724ed8e545',
                                  'type': 'commit'})
 
         response.mustcontain('1 results')