Date: Wed, 27 Apr 2005 23:50:47 +0200
From: Andreas Jobs <andreas.jobs+htdig-dev@ruhr-uni-bochum.de>
To: htdig-dev@lists.sourceforge.net
Subject: [htdig-dev] Simple UTF-8 support patch


Hi,

I recently had the following problem: Due to the use of a CMS some of our pages
are now UTF-8 encoded. Since we are a German university our pages may contain
German umlauts ;-) I use ht://Dig to index all servers on the campus. The
problem is/was that we cannot find words with umlauts on those UTF-8 pages.

First workaround: add accept-charset="ISO-8859-1" to the ht://Dig search form.
Now we can find words with umlauts on old (non-UTF-8) pages but not on the new
(UTF-8) pages.

Attached you'll find a patch that does a simple UTF-8 to 8-bit ASCII
conversion. All non-convertible characters are mapped to a question mark (?).

ReadBody may not be the best place to add this code (and it should be added to
ReadChunkedBody as well), but it was the easiest way to achieve my goal. Perhaps
someone can give me a hint for a better place :-)

Comments welcome ....

Andreas

-- 
! Andreas Jobs                                 Network Operating Center !
!                                              Ruhr-Universitaet Bochum !
! The only way to clean a compromised system is to flatten and rebuild. !


diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.cc htdig-3.2.0b6/htnet/HtHTTP.cc
--- htdig-3.2.0b6.orig/htnet/HtHTTP.cc	2004-05-28 15:15:23.000000000 +0200
+++ htdig-3.2.0b6/htnet/HtHTTP.cc	2005-04-27 23:26:16.000000000 +0200
@@ -643,6 +643,8 @@
     String	line = 0;
     int		inHeader = 1;
 
+    _needUTF8Convert = 0;
+
     if (_response._modification_time)
     {
 	delete _response._modification_time;
@@ -731,7 +733,15 @@
             token = strtok(token, "\n\t");
 
             if (token && *token)
+            {
                _response._content_type = token;
+               if ((_response._content_type.indexOf("text/html") != -1) && (_response._content_type.indexOf("UTF-8") != -1))
+               {
+                  if ( debug > 4 )
+                     cout << "needUTF8Convert flagged" << endl;
+                  _needUTF8Convert = 1;
+               }
+            }
 
          }
          else if( ! mystrncasecmp((char*)line, "content-length:", 15))
@@ -970,6 +980,31 @@
 
     }
 
+    if ( _needUTF8Convert )
+    {
+        if ( debug > 4 )
+            cout << "Converting UTF-8 characters" << endl;
+
+        char *srcPtr, *dstPtr;
+        srcPtr = dstPtr = _response._contents.get();
+        while ( *srcPtr )
+        {
+            if ( ( *srcPtr & 0x80 ) == 0 )
+                *dstPtr++ = *srcPtr++;
+            else if ( ( *srcPtr & 0xE0 ) == 0xC0 ) {
+                *dstPtr++ = (((*srcPtr & 0x03) << 6) | (*(srcPtr+1) & 0x3F)  ) & 0xFF;
+                srcPtr += 2;
+            } else if ( ( *srcPtr & 0xF0 ) == 0xE0 ) {
+                *dstPtr++ = '?';
+                srcPtr += 3;
+            } else {
+                *dstPtr++ = '?';
+                srcPtr += 4;
+            }
+        }
+        *dstPtr = 0;
+    }
+
     // Set document length
     _response._document_length = _response._contents.length();
 
diff -ur htdig-3.2.0b6.orig/htnet/HtHTTP.h htdig-3.2.0b6/htnet/HtHTTP.h
--- htdig-3.2.0b6.orig/htnet/HtHTTP.h	2004-05-28 15:15:23.000000000 +0200
+++ htdig-3.2.0b6/htnet/HtHTTP.h	2005-04-27 23:25:43.000000000 +0200
@@ -316,6 +316,7 @@
    int      	_bytes_read;        // Bytes read
    URL		_url;               // URL to retrieve
    URL		_referer;	    // Referring URL
+   int		_needUTF8Convert;   // Flag for simple UTF-8 convert
 
    String      _accept_language;    // accept-language directive