home *** CD-ROM | disk | FTP | other *** search
Java Source | 1997-11-06 | 14.8 KB | 461 lines |
- // ClientSearchGenerator Version 1.0
- // Die Suchmaschine für Jedermann, Generator
- // Autor: Gerhard Schild, 1997
- // Für: PC-ONLiNE
-
- import java.applet.*;
- import java.awt.*;
- import java.util.*;
- import java.io.*;
- import java.net.*;
- import java.util.zip.*;
-
- /**
-  * Document analyser.
-  *
-  * Reads a document from an input stream, splits it into tags and text runs,
-  * hands every word of the text runs to the index generator and, when a
-  * spider is attached, reports the link targets found in A, FRAME and AREA
-  * tags to that spider.
-  */
- class DocumentParser {
-     SearchEngineSpider spider;   // receives discovered links; null when links are not followed
-     InputStream in;              // document being read (not closed by this class)
-     SearchEngineGenerator g;     // index that receives documents and words
-     String section;              // result of the most recent getSection() call
-     URL url;                     // address of the document, passed to the spider with each link
-     int pos, unget;              // read offset inside section / one pushed-back character (-1 = none)
-
-     /**
-      * Creates a parser that also reports links to a spider.
-      *
-      * @param g   index generator receiving the words
-      * @param in  stream delivering the document
-      * @param s   spider that is told about every link found
-      * @param url address of the document, handed to the spider together with each link
-      */
-     public DocumentParser(SearchEngineGenerator g, InputStream in, SearchEngineSpider s, URL url) {
-         this.g=g;
-         this.in=in;
-         this.spider=s;
-         this.url=url;
-         unget=-1;
-     }
-
-     /**
-      * Creates a parser that only indexes words and ignores links.
-      *
-      * @param g  index generator receiving the words
-      * @param in stream delivering the document
-      */
-     public DocumentParser(SearchEngineGenerator g, InputStream in) {
-         this.g=g;
-         this.in=in;
-         this.url=null;
-         this.spider=null;
-         unget=-1;
-     }
-
-     /**
-      * Fetches the next character: the pushed-back one if present, otherwise
-      * the next byte of the input stream.
-      *
-      * @return the character, or a negative value at end of input
-      */
-     int get() throws IOException {
-         int r;
-         if(unget>=0) {
-             r=unget; unget=-1;
-         }
-         else
-             r=in.read();
-         return r;
-     }
-
-     /** Tells whether c is a blank, tab, carriage return or line feed. */
-     static boolean isWS(int c) { return c==' ' || c=='\t' || c=='\r' || c=='\n'; }
-
-     /** Tells whether c counts as part of a word (letter or digit). */
-     static boolean isAlpha(int c) { return Character.isLetterOrDigit((char)c); }
-
-     /**
-      * Reads the next section of the document: either one tag (from '<' up to
-      * and including '>') or the text up to the next '<'. Runs of whitespace
-      * are reduced to one blank, except inside double-quoted parts of a tag,
-      * which are copied unchanged. The collected text is passed through
-      * HTMLcvt.fromHTML and remembered in {@link #section}; the word reader
-      * position is reset.
-      *
-      * @return the section; parse() treats an empty result as end of document
-      */
-     String getSection() throws IOException {
-         StringBuffer b=new StringBuffer();
-         pos=0;
-         int c;
-         boolean tag, quote=false, ws=false;
-
-         while( (c=get())>=0 && isWS(c) );   // skip leading whitespace
-         tag=c=='<';
-         if(c>=0) {
-             unget=c;                        // first significant character belongs to the section
-             while( (c=get())>=0 ) {
-                 if(tag) {
-                     if(quote) {             // inside "...": copy verbatim
-                         b.append((char)c);
-                         if(c=='\"') quote=ws=false;
-                         continue;
-                     }
-                     if(c=='\"') { b.append('\"'); quote=true; continue; }
-                     if(c=='>') { b.append('>'); break; }
-                 }
-                 else {
-                     // a text run ends in front of the next tag, which is kept for the next call
-                     if(c=='<') { unget=c; break; }
-                 }
-                 if(isWS(c)) {               // collapse whitespace
-                     if(!ws) { b.append(' '); ws=true; }
-                     continue;
-                 }
-                 ws=false;
-                 b.append((char)c);
-             }
-         }
-         return section=HTMLcvt.fromHTML(b.toString());
-     }
-
-     /**
-      * Delivers the next character of the current section.
-      *
-      * @return the character, or -1 when the section is used up
-      */
-     int getChar() {
-         if(pos>=section.length()) return -1;
-         int c=section.charAt(pos++);
-         return c;
-     }
-
-     /**
-      * Extracts the next word (a run of letters and digits) from the current
-      * section.
-      *
-      * @return the word, or an empty string when the section holds no more words
-      */
-     String getWord() {
-         StringBuffer w=new StringBuffer();
-         int c;
-         // skip everything in front of the word
-         while( (c=getChar())>=0 )
-             if(isAlpha(c))
-                 break;
-         if(c>=0) {
-             for(;;) {
-                 w.append((char)c);
-                 if((c=getChar())<0) break;
-                 if(!isAlpha(c)) break;
-             }
-         }
-         return w.toString();
-     }
-
-     /**
-      * Finds the first position at which what occurs in text, ignoring case.
-      * The search runs on the unmodified text so that the result is a valid
-      * offset into it.
-      *
-      * @return the offset, or -1 if what does not occur
-      */
-     private static int indexOfIgnoreCase(String text, String what) {
-         int len=what.length();
-         int last=text.length()-len;
-         for(int i=0; i<=last; ++i)
-             if(text.regionMatches(true, i, what, 0, len))
-                 return i;
-         return -1;
-     }
-
-     /**
-      * Spider support: extracts the value of the attribute named by prefix
-      * (as in A href="...", FRAME src="..." or AREA href="...") from a tag and
-      * reports it to the spider. Tags in which the attribute is missing, is
-      * not followed by '=' or is not terminated are silently skipped.
-      *
-      * The attribute is located in the original tag text, not in the
-      * upper-cased copy: upper-casing can change the length of a string
-      * (for example a sharp s becomes "SS"), which would make offsets taken
-      * from the copy point to the wrong place in the original.
-      *
-      * @param prefix attribute name in upper case
-      * @param tag    the tag as read from the document
-      * @param uptag  upper-cased copy of the tag; no longer consulted, kept so
-      *               that the signature stays unchanged
-      */
-     void spiderFollow(String prefix, String tag, String uptag) {
-         int n=indexOfIgnoreCase(tag, prefix);
-         if(n<0) return;
-         n+=prefix.length();
-         int len=tag.length();
-         // getSection() has already reduced whitespace outside quotes to single blanks
-         if(n<len && tag.charAt(n)==' ') ++n;          // optional blank
-         if(n>=len || tag.charAt(n++)!='=') return;    // "=" missing
-         if(n<len && tag.charAt(n)==' ') ++n;          // optional blank
-         if(n>=len) return;
-         StringBuffer link=new StringBuffer();
-         if(tag.charAt(n)=='\"') {                     // quoted link
-             int end=tag.indexOf('\"', n+1);
-             if(end<0) return;                         // closing quote missing
-             link.append(tag.substring(n+1, end));
-         }
-         else {
-             for(;;) {
-                 if(n>=len) return;                    // tag cut off before the value ended
-                 char ch=tag.charAt(n++);
-                 if(ch==' ' || ch=='>') break;
-                 link.append(ch);
-             }
-         }
-         spider.follow(url, link.toString());
-     }
-
-     /**
-      * Runs the analysis: registers the document, indexes every word of its
-      * text runs, collects the text between TITLE tags as caption and shows a
-      * progress line on standard output. The stream is read to its end but
-      * not closed.
-      *
-      * @param document name under which the document is entered into the index
-      * @throws IOException if reading the stream fails
-      */
-     public void parse(String document) throws IOException {
-         int doc=g.addDocument(document, "");
-         int words=0, back=0;
-         boolean inTitle=false, init=true;
-         StringBuffer title=new StringBuffer();
-         String titleString=null;
-
-         for(String s; (s=getSection()).length()>0; ) {
-             if(init) {
-                 // (re)start the progress line; repeated once the title is known
-                 System.out.print("\r * "+(titleString==null ? "" : titleString+": ")+g.getDocs()+"/");
-                 back=0;
-                 init=false;
-             }
-             if(s.startsWith("<")) {                                 // tag?
-                 // fixed locale: tag names must be recognised regardless of the default locale
-                 String tag=s.toUpperCase(Locale.ENGLISH);
-                 if(tag.startsWith("<TITLE")) inTitle=true;
-                 if(tag.startsWith("</TITLE")) {
-                     inTitle=false;
-                     titleString=title.toString();
-                     init=true;
-                 }
-                 if(tag.startsWith("<A ")) {                         // <A href="...">
-                     if(spider!=null) spiderFollow("HREF", s, tag);
-                 }
-                 if(tag.startsWith("<FRAME ")) {                     // <FRAME src="...">
-                     if(spider!=null) spiderFollow("SRC", s, tag);
-                 }
-                 if(tag.startsWith("<AREA ")) {                      // <AREA href="...">
-                     if(spider!=null) spiderFollow("HREF", s, tag);
-                 }
-             }
-             else {
-                 if(inTitle) {
-                     if(title.length()>0) title.append(' ');
-                     title.append(section);
-                 }
-                 for(String t; (t=getWord()).length()>0; ) {
-                     // NOTE(review): default-locale upper-casing is kept here on purpose;
-                     // presumably the search side folds case the same way - confirm before changing
-                     g.addWord(doc, t.toUpperCase());
-                     String s2=""+(++words)+"/"+g.getSize();
-                     while(back-->0) System.out.print('\b');         // wipe the previous counter
-                     System.out.print(s2);
-                     back=s2.length();
-                 }
-             }
-         }
-         // enter the document again, this time with its caption
-         if(titleString!=null) g.addDocument(document, titleString);
-         System.out.println("");
-     }
- }
-
- // Spider (verfolgt alle Links)
- class SearchEngineSpider {
- URL home; // Basisadresse
- String homes; // dto. als String
- Properties done, todo; // Dokumentenlisten
- SearchEngineSpider(URL home, String doc) {
- this.home=home;
- done=new Properties();
- todo=new Properties();
- homes=home.toString();
- follow(home, doc);
- }
- // Link verfolgen
- public void follow(URL current, String doc) {
- int n=doc.lastIndexOf('#'); // Marke?
- if(n>=0) doc=doc.substring(0, n);
- try {
- URL newURL=new URL(current, doc);
- String newdoc=newURL.toString();
- if(done.get(newdoc)==null) // Noch nicht erledigt?
- todo.put(newdoc, ""); // Dann vormerken
- } catch(Exception e) {
- }
- }
- // Spider starten
- void run(SearchEngineGenerator g) {
- Enumeration n;
- // NΣchsten Eintrag aus der todo-Liste holen
- while( (n=todo.keys()).hasMoreElements() ) {
- String key=(String)n.nextElement();
- todo.remove(key);
- done.put(key, "");
- // Links ignorieren, die nicht unterhalb der Basis-URL liegen
- try {
- URL doc=new URL(key);
- String docs=doc.toString(), updocs=docs.toUpperCase();
- if(!docs.startsWith(homes)) {
- System.out.println("Ignoriere externen Link "+docs);
- }
- else if(! (updocs.endsWith("/") || updocs.endsWith(".HTM") || updocs.endsWith(".TXT")
- || updocs.endsWith(".HTML") || updocs.endsWith(".SHTML")) ) {
- System.out.println("Ignoriere Link mit unbekannter Erweiterung: "+docs);
- }
- else {
- System.out.println("Indiziere "+docs);
- DocumentParser p=new DocumentParser(g, doc.openStream(), this, doc);
- p.parse(key.substring(homes.length()));
- }
- } catch(Exception e) {
- System.err.println("Exception: "+e.getMessage());
- }
- }
- }
- }
-
- // Generator fⁿr den Suchindex
- class SearchEngineGenerator {
- Properties tab; // Tabelle der Schlⁿsselworte
- Hashtable docs; // Tabelle der Dokumente
- Properties exclude; // Ausschlu▀liste
- int nextDoc;
-
- SearchEngineGenerator() {
- nextDoc=0;
- tab=new Properties();
- docs=new Hashtable();
- exclude=new Properties();
- // Ausschlu▀liste laden
- try {
- InputStream in=new FileInputStream("exclude.dat");
- exclude.load(in);
- in.close();
- } catch(IOException e) {
- System.err.println("Ausschlussliste exclude.dat konnte nicht geladen werden");
- }
- }
- void clear() { tab.clear(); docs.clear(); nextDoc=0; }
- // Tabelle speichern
- void save(OutputStream out) throws IOException {
- tab.save(out, "SearchEngine");
- out.close();
- }
- void saveZIP(OutputStream out) throws IOException {
- save(new DeflaterOutputStream(out, new Deflater(Deflater.BEST_COMPRESSION)));
- }
- // Tabelle laden
- void load(InputStream in) throws IOException {
- clear();
- tab.load(in);
- in.close();
- for(;;) {
- String s=tab.getProperty("."+IntToString(nextDoc));
- if(s==null) break;
- SearchEngineResult r=new SearchEngineResult(s);
- addDocument(r.document, r.caption);
- }
- }
- void loadZIP(InputStream in) throws IOException {
- load(new InflaterInputStream(in));
- }
- public int getSize() { return tab.size(); }
- public int getDocs() { return docs.size(); }
- public String getStatus() { return ""+getDocs()+" Dokumente/"+getSize()+" Woerter"; }
- // Dokument speichern. Liefert Dokumenten-Index zurⁿck
- public int addDocument(String document, String caption) {
- Integer b=(Integer)docs.get(document);
- if(b==null) b=new Integer(nextDoc++);
- docs.put(document, b);
- tab.put("."+IntToString(b.intValue()), document+","+HTMLcvt.toHTML(caption));
- return b.intValue();
- }
- // Dokumentenindex (de-)kodieren
- static final String encoding="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
- static final int enclen=62;
- static public char IntToChar(int i) { return encoding.charAt(i); }
- static public int CharToInt(char c) { return encoding.indexOf((int)c); }
- public static int StringToInt(String s, int index) {
- return CharToInt(s.charAt(index))*enclen+CharToInt(s.charAt(index+1));
- }
- public static String IntToString(int i) {
- return ""+IntToChar(i/enclen)+IntToChar(i%enclen);
- }
- // Word im Suchindex speichern
- public void addWord(int document, String word) {
- if(exclude!=null && exclude.getProperty(word)!=null) return;
- if(word.length()<2) return;
- // Eintrag fⁿr das Wort suchen
- String t=tab.getProperty(word);
- if(t==null) {
- // Neues Wort
- tab.put(word, IntToString(document));
- }
- else {
- // Wort schon da: Testen, ob das Dokument schon vorhanden ist
- boolean found=false;
- for(int i=0; i<t.length(); i+=2) {
- if(document==StringToInt(t, i))
- found=true;
- }
- if(!found) {
- tab.put(word, t+IntToString(document));
- }
- }
- }
- }
-
- /**
-  * Main program of the generator: loads the search index, adds documents to
-  * it according to the command line and writes it back.
-  */
- public class ClientSearchGenerator {
-     static SearchEngineGenerator g;         // the index being worked on
-     static boolean zipAvailable=false;      // set when the java.util.zip stream classes can be loaded
-
-     /** Prints the valid arguments and terminates the program with exit code 1. */
-     static void usage() {
-         System.out.println("Gueltige Argumente:");
-         System.out.println(" CLEAR Suchindex loeschen");
-         System.out.println(" ADD home file [url] Datei indizieren");
-         System.out.println(" ADDW home file Mehrere Dateien indiziern (mit Wildcards)");
-         System.out.println(" ADDWS home file Mehrere Dateien indiziern (mit Wildcards ");
-         System.out.println(" und Unterverzeichnissen)");
-         System.out.println(" SPIDER home file Dokument und alle Verweise indizieren");
-         System.exit(1);
-     }
-
-     /** Closes a stream, ignoring a failure to do so. */
-     private static void closeQuietly(InputStream in) {
-         try {
-             in.close();
-         } catch(IOException ignored) {
-             // nothing sensible can be done; the data has been read already
-         }
-     }
-
-     /** Loads the current state of the index; a failure is reported and otherwise ignored. */
-     static void load() {
-         System.out.println("Lade "+ClientSearch.rawfile);
-         try {
-             InputStream in=new FileInputStream(ClientSearch.rawfile);
-             g.load(in);
-             in.close();
-         } catch(Exception e) {
-             System.err.println("Exception: "+e.getMessage());
-         }
-         System.out.println(g.getStatus());
-     }
-
-     /**
-      * Writes the index to the raw file and, if compression is available,
-      * to the compressed file as well. Failures are reported and otherwise
-      * ignored.
-      */
-     static void save() {
-         try {
-             System.out.println("Speichere "+ClientSearch.rawfile);
-             g.save(new FileOutputStream(ClientSearch.rawfile));
-         } catch(Exception e) {
-             System.err.println("Exception: "+e.getMessage());
-         }
-         if(zipAvailable) {
-             try {
-                 System.out.println("Speichere "+ClientSearch.zipfile);
-                 g.saveZIP(new FileOutputStream(ClientSearch.zipfile));
-             } catch(Exception e) {
-                 System.err.println("Exception: "+e.getMessage());
-             }
-         }
-         else
-             System.err.println("WARNUNG: ZIP-Kompression nicht verfuegbar. "+
-                     ClientSearch.zipfile+" wurde nicht erzeugt.");
-         System.out.println(g.getStatus());
-     }
-
-     /**
-      * Indexes the files of a directory that pass the filter and, if
-      * requested, descends into all subdirectories. Every file is closed
-      * again after it has been parsed, so that large directory trees do not
-      * use up the available file handles.
-      *
-      * @param dir    directory to search
-      * @param prefix text put in front of each file name to form the document name
-      * @param filter selects the files to index
-      * @param sub    whether subdirectories are searched as well
-      */
-     static void addDir(File dir, String prefix, FilenameFilter filter, boolean sub) {
-         System.out.println("Durchsuche "+dir.toString());
-         String[] files=dir.list(filter);
-         if(files!=null)
-             for(int i=0; i<files.length; ++i) {
-                 System.out.println("Indiziere "+prefix+files[i]);
-                 try {
-                     File file=new File(dir, files[i]);
-                     FileInputStream in=new FileInputStream(file);
-                     try {
-                         DocumentParser p=new DocumentParser(g, in);
-                         p.parse(prefix+files[i]);
-                     } finally {
-                         closeQuietly(in);
-                     }
-                 } catch(Exception e) {
-                     System.err.println("Exception: "+e.getMessage());
-                 }
-             }
-         // search the subdirectories
-         if(sub) {
-             files=dir.list();
-             if(files!=null)
-                 for(int i=0; i<files.length; ++i) {
-                     File subdir=new File(dir, files[i]);
-                     if(subdir.isDirectory())
-                         addDir(subdir, prefix+files[i]+File.separator, filter, sub);
-                 }
-         }
-     }
-
-     /**
-      * Program entry: evaluates the arguments. The first argument selects the
-      * command (CLEAR, ADD, ADDW, ADDWS or SPIDER, in any case); anything else
-      * prints the usage text and terminates.
-      *
-      * @param args command line arguments
-      */
-     public static void main(String args[]) {
-         if(args.length<1) usage();
-         g=new SearchEngineGenerator();
-         // probe for the compression classes; without them only the raw file is written
-         try {
-             Class.forName("java.util.zip.InflaterInputStream");
-             Class.forName("java.util.zip.DeflaterOutputStream");
-             zipAvailable=true;
-         } catch(Exception ignored) {
-             // classes not present: zipAvailable stays false and save() prints a warning
-         }
-         if(args[0].equalsIgnoreCase("CLEAR")) {
-             // nothing is loaded, so the empty index is written
-             save();
-         }
-         else if(args[0].equalsIgnoreCase("ADD")) {
-             if(args.length<3||args.length>4) usage();
-             load();
-             try {
-                 URL home=new URL(args[1]);
-                 URL doc=new URL(home, args[2]);
-                 System.out.println("Indiziere "+doc.toString());
-                 InputStream in=doc.openStream();
-                 try {
-                     DocumentParser p=new DocumentParser(g, in);
-                     // the optional fourth argument replaces the file name as document name
-                     p.parse(args.length>=4 ? args[3] : args[2]);
-                 } finally {
-                     closeQuietly(in);
-                 }
-             } catch(Exception e) {
-                 System.err.println("Exception: "+e.getMessage());
-             }
-             save();
-         }
-         else if(args[0].equalsIgnoreCase("ADDW")
-                 ||args[0].equalsIgnoreCase("ADDWS")) {
-             boolean sub=args[0].equalsIgnoreCase("ADDWS");
-             if(args.length!=3) usage();
-             load();
-             addDir(new File(args[1]), "", new WildcardFilter(args[2]), sub);
-             save();
-         }
-         else if(args[0].equalsIgnoreCase("SPIDER")) {
-             if(args.length!=3) usage();
-             load();
-             try {
-                 URL home=new URL(args[1]);
-                 SearchEngineSpider spider=new SearchEngineSpider(home, args[2]);
-                 spider.run(g);
-             } catch(Exception e) {
-                 System.err.println("Exception: "+e.getMessage());
-             }
-             save();
-         }
-         else
-             usage();
-     }
- }
-
-
-