From hans-peter.nilsson@axis.com Wed Jan 13 23:54:01 1999 Date: Thu, 14 Jan 1999 03:50:46 +0100 From: Hans-Peter Nilsson To: htdig@sdsu.edu Subject: htdig: Patch: support META elements for external parsers. Here's an implementation of META elements for external parsers; 'm' was used for this. Nothing really new; most was stolen from HTML.cc (no, I could not find a good way to share that code within limits). Note that meta.html is not up-to-date (regardless of this). I did not fix that; I see it as a bug that can be fixed during the feature-freeze (schemes within schemes :-) Thu Jan 14 03:16:15 1999 Hans-Peter Nilsson * htdig/ExternalParser.cc (parse): Added support for 'm': meta element. * htdoc/attrs.html: Document it. Index: htdig/ExternalParser.cc =================================================================== RCS file: /opt/htdig/cvs/htdig3/htdig/ExternalParser.cc,v retrieving revision 1.4 diff -p -c -r1.4 ExternalParser.cc *** ExternalParser.cc 1998/12/06 18:46:59 1.4 --- ExternalParser.cc 1999/01/14 02:36:20 *************** static char RCSid[] = "$Id: ExternalPars *** 30,35 **** --- 30,36 ---- #include #include #include + #include static Dictionary *parsers = 0; extern String configFile; *************** ExternalParser::parse(Retriever &retriev *** 153,158 **** --- 154,162 ---- return; } + unsigned int minimum_word_length + = config.Value("minimum_word_length", 3); + String line; char *token1, *token2, *token3; URL url; *************** ExternalParser::parse(Retriever &retriev *** 209,214 **** --- 213,328 ---- token1 = strtok(0, "\t"); if (token1 != NULL) retriever.got_image(token1); + else + cerr<< "External parser error in line:"< + // says that the "name" attribute defaults to + // the http-equiv attribute if empty. 
+ if (*name == '\0') + name = httpEquiv; + + if (*httpEquiv != '\0') + { + // tags that + // fly with any reasonable DTD out there + // + if (*name != '\0' && *content != '\0') + { + if (keywordsMatch.CompareWord(name)) + { + char *w = strtok(content, " ,\t\r"); + while (w) + { + if (strlen(w) >= minimum_word_length) + retriever.got_word(w, 1, 10); + w = strtok(0, " ,\t\r"); + } + } + else if (mystrcasecmp(name, "htdig-email") == 0) + { + retriever.got_meta_email(content); + } + else if (mystrcasecmp(name, "htdig-notification-date") == 0) + { + retriever.got_meta_notification(content); + } + else if (mystrcasecmp(name, "htdig-email-subject") == 0) + { + retriever.got_meta_subject(content); + } + else if (mystrcasecmp(name, "description") == 0 + && strlen(content) != 0) + { + // + // We need to do two things. First grab the description + // + String meta_dsc = content; + + if (meta_dsc.length() > max_meta_description_length) + meta_dsc = meta_dsc.sub(0, max_meta_description_length).get(); + if (debug > 1) + cout << "META Description: " << content << endl; + retriever.got_meta_dsc(meta_dsc); + + // + // Now add the words to the word list + // (slot 11 is the new slot for this) + // + char *w = strtok(content, " \t\r"); + while (w) + { + if (strlen(w) >= minimum_word_length) + retriever.got_word(w, 1, 11); + w = strtok(0, " \t\r"); + } + } + } + } else cerr<< "External parser error in line:"< The output consists of records, each record terminated ! with a newline. Each record is a series of non-empty tab ! separated fields. The first field is a single character that specifies the record type. The rest of the fields are determined by the record type. --- 1277,1285 ---- The external parser is to write information for htdig on its standard output.
The output consists of records, each record terminated ! with a newline. Each record is a series of (unless ! expressly allowed to be empty) non-empty tab-separated ! fields. The first field is a single character that specifies the record type. The rest of the fields are determined by the record type.
*************** *** 1467,1472 **** --- 1468,1504 ---- the document. + + + + + + + + + + + + +
+ m + + http-equiv + + The HTTP-EQUIV attribute of a META tag. + May be empty. +
+ name + + The NAME attribute of this META + tag. May be empty. +
+ contents + + The CONTENT attribute of this META tag. + May be empty. +
brgds, H-P ---------------------------------------------------------------------- To unsubscribe from the htdig mailing list, send a message to htdig-request@sdsu.edu containing the single word "unsubscribe" in the body of the message.