changeset 443:e5157e2a530e

added safe unicode function, and implemented it in whoosh indexer
author Marcin Kuzminski <marcin@python-works.com>
date Wed, 01 Sep 2010 23:38:03 +0200
parents d66a7fa7689b
children 0668919c307c
files pylons_app/lib/helpers.py pylons_app/lib/indexers/daemon.py
diffstat 2 files changed, 18 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/pylons_app/lib/helpers.py	Wed Sep 01 23:32:47 2010 +0200
+++ b/pylons_app/lib/helpers.py	Wed Sep 01 23:38:03 2010 +0200
@@ -336,3 +336,19 @@
     gravatar_url += urllib.urlencode({'d':default, 's':str(size)})
 
     return gravatar_url
+
+def safe_unicode(str):
+    """Convert ``str`` to unicode, tolerating undecodable byte strings.
+
+    Plain conversion is tried first, then UTF-8 decoding with undecodable
+    bytes replaced, and finally a ``string_escape`` representation."""
+    # NOTE: the parameter shadows the ``str`` builtin, so the builtin must
+    # not be called in here; the name is kept for existing keyword callers.
+    try:
+        return unicode(str)
+    except UnicodeDecodeError:
+        pass
+    try:
+        return unicode(str, 'utf-8', 'replace')
+    except UnicodeDecodeError:
+        return unicode(str.encode('string_escape'))
\ No newline at end of file
--- a/pylons_app/lib/indexers/daemon.py	Wed Sep 01 23:32:47 2010 +0200
+++ b/pylons_app/lib/indexers/daemon.py	Wed Sep 01 23:38:03 2010 +0200
@@ -36,6 +36,7 @@
 import traceback
 from pylons_app.config.environment import load_environment
 from pylons_app.model.hg_model import HgModel
+from pylons_app.lib.helpers import safe_unicode
 from whoosh.index import create_in, open_dir
 from shutil import rmtree
 from pylons_app.lib.indexers import ANALYZER, INDEX_EXTENSIONS, IDX_LOCATION, \
@@ -77,11 +78,7 @@
             fobj = open(path, 'rb')
             content = fobj.read()
             fobj.close()
-            try:
-                u_content = unicode(content)
-            except UnicodeDecodeError:
-                #incase we have a decode error just represent as byte string
-                u_content = unicode(str(content).encode('string_escape'))
+            u_content = safe_unicode(content)
         else:
             log.debug('    >> %s' % path)
             #just index file name without it's content