From pe@iki.fi Tue Aug 5 17:00:47 1997 Date: Tue, 5 Aug 1997 17:00:40 +0300 (EEST) From: Pasi Eronen <pe@iki.fi> To: HTDig mailing list Subject: Non-IP virtual hosts support Hi! I've added support for non-IP virtual hosts to HTDig (virtual hosts with different IPs should work already). These patches add a new configuration file option: primary_hostnames: primary1[=alias1,...] If a host name is found in this list, the given primary name is used when "normalizing" the URL. If the host name is not found, it uses the "old-style" IP lookup. For example, if you are running non-IP virtual hosts www.firm.com and www.foo.org, simply specify: primary_hostnames: www.firm.com www.foo.org The form "primary1=alias1,..." is mainly useful if the old-style IP lookup doesn't work for some other reason than non-IP virtual hosts. Since the patches are somewhat big, I won't post them here. Instead, they're available from [URL lost in this archived copy]. You probably must get the other patches on that page, too, except the getdate patch (I've received reports that it doesn't work on all platforms -- I'll fix it when I get more information). PLEASE NOTE THAT THESE PATCHES ARE EXPERIMENTAL! I'm not running non-IP virtual hosts myself, so the testing has been very limited. Comments are most welcome. Best regards, Pasi --- Pasi Eronen <pe@iki.fi>, +358-50-5123499 *** ../htdig-3.0.8b1/htcommon//defaults.cc Tue Aug 5 15:23:46 1997 --- htcommon/defaults.cc Tue Aug 5 15:39:26 1997 *************** *** 86,89 **** --- 86,90 ---- {"page_list_header", "
Pages:
"}, {"prev_page_text", "[prev]"}, + {"primary_hostnames", ""}, {"remove_bad_urls", "true"}, {"robotstxt_name", "htdig"}, *** ../htdig-3.0.8b1/htdig/Document.cc Tue Aug 5 15:23:46 1997 --- htdig/Document.cc Tue Aug 5 15:40:33 1997 *************** *** 61,65 **** { proxy = new URL(proxyURL); - proxy->normalize(); } --- 61,64 ---- *** ../htdig-3.0.8b1/htdig/Retriever.h Tue Aug 5 15:23:47 1997 --- htdig/Retriever.h Tue Aug 5 15:31:23 1997 *************** *** 27,30 **** --- 27,31 ---- class Document; class URLRef; + class Server; class Retriever *************** *** 114,117 **** --- 115,119 ---- // int Need2Get(char *url); + Server * GetServer(URL &url); DocumentRef * GetRef(char *url); int IsValidURL(char *url); *************** *** 118,121 **** --- 120,124 ---- String * IsLocal(char *url); String * IsLocalUser(char *url); + void NormalizeURL(URL &url); void RetrievedDocument(Document &, char *url, DocumentRef *ref); void parse_url(URLRef &urlRef); *** htdig/Retriever.cc.orig Tue Aug 5 15:23:47 1997 --- htdig/Retriever.cc Tue Aug 5 16:15:16 1997 *************** *** 27,30 **** --- 27,36 ---- #include "Document.h" #include + + #include + #include + #include + #include + #include #include *************** *** 104,114 **** { URL u(tokens[i]); ! sig = u.signature(); ! server = (Server *) servers[sig]; ! if (!server) ! { ! server = new Server(u.host(), u.port()); ! servers.Add(sig, server); ! } server->push(u.get(), 0, 0); sig = u.get(); --- 110,114 ---- { URL u(tokens[i]); ! 
server = GetServer(u); server->push(u.get(), 0, 0); sig = u.get(); *************** *** 556,559 **** --- 556,561 ---- String *local = new String(*path, l); *local += &url[prefix->length()]; + if (local->last() == '/') + *local += "index.html"; return local; } *************** *** 661,664 **** --- 663,790 ---- } + + //***************************************************************************** + // void Retriever::NormalizeURL(URL &ur) + // Normalizes the URL by converting host name to "primary name" + // and removing extra "/index.html" if it's there. + // You can specify the primary names with "virtual_hosts" + // option, or with automatic IP lookup (the first name specified is used). + // + void + Retriever::NormalizeURL(URL &url) + { + static Dictionary *hostnames = 0; + + // + // Initialize list of primary hostnames. + // The list is given in format "primary1[=alias1,...] ..." + // + if (!hostnames) + { + hostnames = new Dictionary; + + String tmp = config["primary_hostnames"]; + tmp.lowercase(); + char *primary = strtok(tmp, " \t"); + while (primary) + { + char *aliases = strchr(primary, '='); + if (aliases) + { + *aliases++ = '\0'; + StringList alias_list(aliases, ','); + String *alias; + alias_list.Start_Get(); + while (alias = (String*) alias_list.Get_Next()) + { + hostnames->Add(alias->get(), new String(primary)); + } + } + hostnames->Add(primary, new String(primary)); + primary = strtok(0, " \t"); + } + } + + if (strcmp(url.service(), "http") != 0) + return; + + // Convert the host name to the "primary name" + String host = url.host(); + host.lowercase(); + String *real_host = (String*) hostnames->Find(host); + if (!real_host) + { + // + // Convert a hostname to an IP address + // + static Dictionary hostbyname; + unsigned long addr; + struct hostent *hp; + + String *ip = (String *) hostbyname[host]; + if (ip) + { + memcpy((char *) &addr, ip->get(), ip->length()); + } + else + { + addr = inet_addr(host.get()); + if (addr == 0xffffffff) + { + hp = 
gethostbyname(host.get()); + if (hp == NULL) + { + return; + } + memcpy((char *)&addr, (char *)hp->h_addr, hp->h_length); + ip = new String((char *) &addr, hp->h_length); + hostbyname.Add(host, ip); + } + } + static Dictionary machines; + String key; + key << int(addr); + real_host = (String *) machines[key]; + if (!real_host) + { + String *tmp = new String(host); + machines.Add(key, tmp); + real_host = tmp; + } + } + url.host(*real_host); + + // Remove the index.html from the end of the URL path. + char *index = strrchr(url.path(), '/'); + if (index && (strcmp(index, "/index.html") == 0)) + { + String tmp = url.path(); + tmp.chop(10); + url.path(tmp); + } + url.build(); + } + + + //***************************************************************************** + // Server* Retriever::GetServer(URL &url) + // Returns the server object to use with the given URL, + // or create a new one if necessary. + Server* + Retriever::GetServer(URL &url) + { + String key = strlen(url.host())+10; + key = url.host(); + key << ":" << url.port(); + + Server *server = (Server*) servers[key]; + if (!server) + { + server = new Server(url.host(), url.port()); + servers.Add(key, server); + } + return server; + } + //***************************************************************************** *************** *** 744,748 **** { DocumentRef *ref; - Server *server; if (debug > 2) --- 870,873 ---- *************** *** 775,779 **** } ! url.normalize(); if (IsValidURL(url.get())) { --- 900,904 ---- } ! NormalizeURL(url); if (IsValidURL(url.get())) { *************** *** 806,819 **** if (debug > 1) cout << "\n pushing " << url.get() << endl; ! char *sig = url.signature(); ! server = (Server *) servers[sig]; ! if (!server) ! { ! // ! // Hadn't seen this server, yet. Register it ! // ! server = new Server(url.host(), url.port()); ! servers.Add(sig, server); ! } server->push(url.get(), ref->DocHopCount(), base->get()); --- 931,935 ---- if (debug > 1) cout << "\n pushing " << url.get() << endl; ! 
Server *server = GetServer(url); server->push(url.get(), ref->DocHopCount(), base->get()); *************** *** 885,889 **** } ! url.normalize(); if (IsValidURL(url.get())) { --- 1001,1005 ---- } ! NormalizeURL(url); if (IsValidURL(url.get())) { *************** *** 929,942 **** if (debug > 1) cout << " pushing " << url.get() << endl; ! char *sig = url.signature(); ! Server *server = (Server *) servers[sig]; ! if (!server) ! { ! // ! // Hadn't seen this server, yet. Register it ! // ! server = new Server(url.host(), url.port()); ! servers.Add(sig, server); ! } server->push(url.get(), ref->DocHopCount(), base->get()); --- 1045,1049 ---- if (debug > 1) cout << " pushing " << url.get() << endl; ! Server *server = GetServer(url); server->push(url.get(), ref->DocHopCount(), base->get()); *** ../htdig-3.0.8b1/htlib/URL.h Mon Mar 24 06:35:00 1997 --- htlib/URL.h Tue Aug 5 14:26:44 1997 *************** *** 30,33 **** --- 30,34 ---- void parse(char *url); + void build(); char *host() {return _host;} *************** *** 38,42 **** void service(char *s) {_service = s;} char *path() {return _path;} ! void path(char *p); int hopcount() {return _hopcount;} void hopcount(int h) {_hopcount = h;} --- 39,43 ---- void service(char *s) {_service = s;} char *path() {return _path;} ! 
void path(char *p) {_path = p;} int hopcount() {return _hopcount;} void hopcount(int h) {_hopcount = h;} *************** *** 44,49 **** char *get() {return _url;} void dump(); - void normalize(); - char *signature(); private: --- 45,48 ---- *************** *** 55,59 **** int _normal; int _hopcount; - String _signature; void removeIndex(String &); --- 54,57 ---- *** ../htdig-3.0.8b1/htlib/URL.cc Sun Apr 27 17:43:37 1997 --- htlib/URL.cc Tue Aug 5 15:26:59 1997 *************** *** 27,34 **** #include #include - #include - #include - #include - #include #include --- 27,30 ---- *************** *** 56,60 **** _path = nurl._path; _normal = nurl._normal; - _signature = nurl._signature; _hopcount = nurl._hopcount; } --- 52,55 ---- *************** *** 131,135 **** _path = parent._path; _normal = parent._normal; - _signature = parent._signature; _hopcount = parent._hopcount; return; --- 126,129 ---- *************** *** 232,242 **** // Build the url. (Note, the host name has NOT been normalized!) // ! _url = _service; ! _url << ":"; ! if (_host.length()) ! _url << "//" << _host; ! if (_port != 80 && strcmp(_service, "http") == 0) ! _url << ':' << _port; ! _url << _path; } --- 226,230 ---- // Build the url. (Note, the host name has NOT been normalized!) // ! build(); } *************** *** 319,374 **** // Build the url. (Note, the host name has NOT been normalized!) // ! _url = _service; ! _url << "://" << _host; ! if (_port != 80) ! _url << ':' << _port; ! _url << _path; ! } ! ! ! //***************************************************************************** ! // void URL::dump() ! // ! void URL::dump() ! { ! printf("service = '%s'\n", _service.get()); ! printf("host = '%s'\n", _host.get()); ! printf("port = %d\n", _port); ! printf("url = '%s'\n", _url.get()); ! printf("path = '%s'\n", _path.get()); ! } ! ! ! //***************************************************************************** ! // void URL::path(char *newpath) ! // ! void URL::path(char *newpath) ! { ! 
_path = newpath; ! _url = _service; ! _url << "://" << _host; ! if (_port != 80) ! _url << ':' << _port; ! _url << _path; ! } ! ! ! //***************************************************************************** ! // void URL::removeIndex(String &path) ! // Attempt to remove the index.html from the end of a URL path. ! // This needs to be done to normalize the paths and make .../ the ! // same as .../index.html ! // ! void URL::removeIndex(String &path) ! { ! if (path.length() == 0) ! return; ! ! char *slash = strrchr(path, '/'); ! if (!slash) ! return; ! ! if (strcmp(slash, "/index.html") == 0) ! path.chop(10); } --- 307,311 ---- // Build the url. (Note, the host name has NOT been normalized!) // ! build(); } *************** *** 375,438 **** //***************************************************************************** ! // void URL::normalize() ! // Make sure that http URLs are always in the same format. // ! void URL::normalize() { - static int hits = 0, misses = 0; - - if (_service.length() == 0 || _normal) - return; - - if (strcmp(_service, "http") != 0) - return; - - removeIndex(_path); - - // - // Convert a hostname to an IP address - // - _host.lowercase(); - static Dictionary hostbyname; - unsigned long addr; - struct hostent *hp; - - String *ip = (String *) hostbyname[_host]; - if (ip) - { - memcpy((char *) &addr, ip->get(), ip->length()); - hits++; - } - else - { - // printf("Doing lookup on %s\n", _host.get()); - - addr = inet_addr(_host.get()); - if (addr == 0xffffffff) - { - hp = gethostbyname(_host.get()); - if (hp == NULL) - { - return; - } - memcpy((char *)&addr, (char *)hp->h_addr, hp->h_length); - ip = new String((char *) &addr, hp->h_length); - hostbyname.Add(_host, ip); - misses++; - } - } - - static Dictionary machines; - String key; - key << int(addr); - String *realname = (String *) machines[key]; - if (realname) - _host = realname->get(); - else - machines.Add(key, new String(_host)); - - // - // Reconstruct the url - // _url = _service; 
_url << ":"; --- 312,320 ---- //***************************************************************************** ! // void URL::build() ! // Rebuilds the URL string from the component parts. // ! void URL::build() { _url = _service; _url << ":"; *************** *** 442,447 **** _url << ':' << _port; _url << _path; - _normal = 1; - _signature = 0; } --- 324,327 ---- *************** *** 448,466 **** //***************************************************************************** ! // char *URL::signature() ! // Return a string which uniquely identifies the server the current ! // URL is refering to. ! // This is just the string containing the host and port. // ! char *URL::signature() { ! if (_signature.length()) ! return _signature; ! ! if (!_normal) ! normalize(); ! _signature = _host; ! _signature << ':' << _port; ! return _signature; } - --- 328,339 ---- //***************************************************************************** ! // void URL::dump() // ! void URL::dump() { ! printf("service = '%s'\n", _service.get()); ! printf("host = '%s'\n", _host.get()); ! printf("port = %d\n", _port); ! printf("url = '%s'\n", _url.get()); ! printf("path = '%s'\n", _path.get()); }