PC Online 1997 December

home *** CD-ROM | disk | FTP | other *** search

/ PC Online 1997 December / PCO1297.ISO / FilesBBS / FREI / JAVA3.ARJ / JAVA3.ZIP / JAVA / ClientSearchGenerator.java < prev next >

Wrap

Java Source | 1997-11-06 | 14.8 KB | 461 lines

// ClientSearchGenerator Version 1.0 // Die Suchmaschine fⁿr Jedermann, Generator // Autor: Gerhard Schild, 1997 // Fⁿr: PC-ONLiNE import java.applet.*; import java.awt.*; import java.util.*; import java.io.*; import java.net.*; import java.util.zip.*; // Dokumenten-Auswerter class DocumentParser { SearchEngineSpider spider; InputStream in; SearchEngineGenerator g; String section; URL url; int pos, unget; // Auswertung mit Spider public DocumentParser(SearchEngineGenerator g, InputStream in, SearchEngineSpider s, URL url) { this.g=g; this.in=in; this.spider=s; this.url=url; unget=-1; } // Auswertung ohne Spider public DocumentParser(SearchEngineGenerator g, InputStream in) { this.g=g; this.in=in; this.url=null; this.spider=null; unget=-1; } // NΣchstes Zeichen aus dem Eingabestream holen int get() throws IOException { int r; if(unget>=0) { r=unget; unget=-1; } else r=in.read(); return r; } // Zeichen analysieren static boolean isWS(int c) { return c==' ' || c=='\t' || c=='\r' || c=='\n'; } static boolean isAlpha(int c) { return Character.isLetterOrDigit((char)c); } // Text-Abschnitt holen: // Liefert entweder einen Tag oder einen Textbereich zwischen zwei Tags // Whitespace wird dabei auf je ein Leerzeichen reduziert String getSection() throws IOException { StringBuffer b=new StringBuffer(); pos=0; int c; boolean tag, quote=false, ws=false; while( (c=get())>=0 && isWS(c) ); // Fⁿhrenden WS ⁿberspringen tag=c=='<'; if(c>=0) { unget=c; while( (c=get())>=0 ) { if(tag) { if(quote) { // "..."? b.append((char)c); if(c=='\"') quote=ws=false; continue; } if(c=='\"') { b.append('\"'); quote=true; continue; } if(c=='>') { b.append('>'); break; } } else { if(c=='<') { unget=c; break; } } if(isWS(c)) { // Leerraum verkⁿrzen if(!ws) { b.append(' '); ws=true; } continue; } ws=false; b.append((char)c); } } return section=HTMLcvt.fromHTML(b.toString()); } // NΣchstes Zeichen aus dem Textabschnitt holen int getChar() { if(pos>=section.length()) return -1; int c=section.charAt(pos++); return c; } // Extrahiert das nΣchste Wort aus dem aktuellen Textabschnitt String getWord() { StringBuffer w=new StringBuffer(); int c; while( (c=getChar())>=0 ) if(isAlpha(c)) break; if(c>=0) { for(;;) { w.append((char)c); if((c=getChar())<0) break; if(!isAlpha(c)) break; } } return w.toString(); } // Spider: Link verfolgen // <A href="..."> oder "<FRAME SRC="..."> oder "<AREA href="..."> auswerten void spiderFollow(String prefix, String tag, String uptag) { int n=uptag.indexOf(prefix); if(n<0) return; n+=prefix.length(); try { if(tag.charAt(n)==' ') ++n; // Evtl. Whitespace if(tag.charAt(n++)!='=') return; // "=" fehlt if(tag.charAt(n)==' ') ++n; // Evtl. Whitespace StringBuffer link=new StringBuffer(); if(tag.charAt(n)=='\"') { // Link in Anfⁿhrungszeichen? while(tag.charAt(++n)!='\"') link.append((char)tag.charAt(n)); } else { for(;;) { int ch=tag.charAt(n++); if(ch==' ' || ch=='>') break; link.append((char)ch); } } spider.follow(url, link.toString()); } catch(Exception e) { } } // Dokument-Auswertung starten public void parse(String document) throws IOException { int doc=g.addDocument(document, ""); int words=0, back=0; boolean inTitle=false, init=true; StringBuffer title=new StringBuffer(); String titleString=null; for(String s; (s=getSection()).length()>0; ) { if(init) { System.out.print("\r * "+(titleString==null ? "" : titleString+": ")+g.getDocs()+"/"); back=0; init=false; } if(s.startsWith("<")) { // TAG? String tag=s.toUpperCase(); if(tag.startsWith("<TITLE")) inTitle=true; if(tag.startsWith("</TITLE")) { inTitle=false; titleString=title.toString(); init=true; } if(tag.startsWith("<A ")) { // <A href="..."> if(spider!=null) spiderFollow("HREF", s, tag); } if(tag.startsWith("<FRAME ")) { // <FRAME src="..."> if(spider!=null) spiderFollow("SRC", s, tag); } if(tag.startsWith("<AREA ")) { // <AREA href="..."> if(spider!=null) spiderFollow("HREF", s, tag); } } else { if(inTitle) { if(title.length()>0) title.append(' '); title.append(section); } for(String t; (t=getWord()).length()>0; ) { g.addWord(doc, t.toUpperCase()); String s2=""+(++words)+"/"+g.getSize(); while(back-->0) System.out.print('\b'); System.out.print(s2); back=s2.length(); } } } if(titleString!=null) g.addDocument(document, titleString); System.out.println(""); } } // Spider (verfolgt alle Links) class SearchEngineSpider { URL home; // Basisadresse String homes; // dto. als String Properties done, todo; // Dokumentenlisten SearchEngineSpider(URL home, String doc) { this.home=home; done=new Properties(); todo=new Properties(); homes=home.toString(); follow(home, doc); } // Link verfolgen public void follow(URL current, String doc) { int n=doc.lastIndexOf('#'); // Marke? if(n>=0) doc=doc.substring(0, n); try { URL newURL=new URL(current, doc); String newdoc=newURL.toString(); if(done.get(newdoc)==null) // Noch nicht erledigt? todo.put(newdoc, ""); // Dann vormerken } catch(Exception e) { } } // Spider starten void run(SearchEngineGenerator g) { Enumeration n; // NΣchsten Eintrag aus der todo-Liste holen while( (n=todo.keys()).hasMoreElements() ) { String key=(String)n.nextElement(); todo.remove(key); done.put(key, ""); // Links ignorieren, die nicht unterhalb der Basis-URL liegen try { URL doc=new URL(key); String docs=doc.toString(), updocs=docs.toUpperCase(); if(!docs.startsWith(homes)) { System.out.println("Ignoriere externen Link "+docs); } else if(! (updocs.endsWith("/") || updocs.endsWith(".HTM") || updocs.endsWith(".TXT") || updocs.endsWith(".HTML") || updocs.endsWith(".SHTML")) ) { System.out.println("Ignoriere Link mit unbekannter Erweiterung: "+docs); } else { System.out.println("Indiziere "+docs); DocumentParser p=new DocumentParser(g, doc.openStream(), this, doc); p.parse(key.substring(homes.length())); } } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } } } // Generator fⁿr den Suchindex class SearchEngineGenerator { Properties tab; // Tabelle der Schlⁿsselworte Hashtable docs; // Tabelle der Dokumente Properties exclude; // Ausschlu▀liste int nextDoc; SearchEngineGenerator() { nextDoc=0; tab=new Properties(); docs=new Hashtable(); exclude=new Properties(); // Ausschlu▀liste laden try { InputStream in=new FileInputStream("exclude.dat"); exclude.load(in); in.close(); } catch(IOException e) { System.err.println("Ausschlussliste exclude.dat konnte nicht geladen werden"); } } void clear() { tab.clear(); docs.clear(); nextDoc=0; } // Tabelle speichern void save(OutputStream out) throws IOException { tab.save(out, "SearchEngine"); out.close(); } void saveZIP(OutputStream out) throws IOException { save(new DeflaterOutputStream(out, new Deflater(Deflater.BEST_COMPRESSION))); } // Tabelle laden void load(InputStream in) throws IOException { clear(); tab.load(in); in.close(); for(;;) { String s=tab.getProperty("."+IntToString(nextDoc)); if(s==null) break; SearchEngineResult r=new SearchEngineResult(s); addDocument(r.document, r.caption); } } void loadZIP(InputStream in) throws IOException { load(new InflaterInputStream(in)); } public int getSize() { return tab.size(); } public int getDocs() { return docs.size(); } public String getStatus() { return ""+getDocs()+" Dokumente/"+getSize()+" Woerter"; } // Dokument speichern. Liefert Dokumenten-Index zurⁿck public int addDocument(String document, String caption) { Integer b=(Integer)docs.get(document); if(b==null) b=new Integer(nextDoc++); docs.put(document, b); tab.put("."+IntToString(b.intValue()), document+","+HTMLcvt.toHTML(caption)); return b.intValue(); } // Dokumentenindex (de-)kodieren static final String encoding="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; static final int enclen=62; static public char IntToChar(int i) { return encoding.charAt(i); } static public int CharToInt(char c) { return encoding.indexOf((int)c); } public static int StringToInt(String s, int index) { return CharToInt(s.charAt(index))*enclen+CharToInt(s.charAt(index+1)); } public static String IntToString(int i) { return ""+IntToChar(i/enclen)+IntToChar(i%enclen); } // Word im Suchindex speichern public void addWord(int document, String word) { if(exclude!=null && exclude.getProperty(word)!=null) return; if(word.length()<2) return; // Eintrag fⁿr das Wort suchen String t=tab.getProperty(word); if(t==null) { // Neues Wort tab.put(word, IntToString(document)); } else { // Wort schon da: Testen, ob das Dokument schon vorhanden ist boolean found=false; for(int i=0; i<t.length(); i+=2) { if(document==StringToInt(t, i)) found=true; } if(!found) { tab.put(word, t+IntToString(document)); } } } } // Generator-Hauptprogramm public class ClientSearchGenerator { static SearchEngineGenerator g; static boolean zipAvailable=false; static void usage() { System.out.println("Gueltige Argumente:"); System.out.println(" CLEAR Suchindex loeschen"); System.out.println(" ADD home file [url] Datei indizieren"); System.out.println(" ADDW home file Mehrere Dateien indiziern (mit Wildcards)"); System.out.println(" ADDWS home file Mehrere Dateien indiziern (mit Wildcards "); System.out.println(" und Unterverzeichnissen)"); System.out.println(" SPIDER home file Dokument und alle Verweise indizieren"); System.exit(1); } // Tabelle laden static void load() { // Aktuellen Stand laden System.out.println("Lade "+ClientSearch.rawfile); try { InputStream in=new FileInputStream(ClientSearch.rawfile); g.load(in); in.close(); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } System.out.println(g.getStatus()); } // Tabelle speichern static void save() { try { System.out.println("Speichere "+ClientSearch.rawfile); g.save(new FileOutputStream(ClientSearch.rawfile)); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } if(zipAvailable) { try { System.out.println("Speichere "+ClientSearch.zipfile); g.saveZIP(new FileOutputStream(ClientSearch.zipfile)); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } else System.err.println("WARNUNG: ZIP-Kompression nicht verfuegbar. "+ ClientSearch.zipfile+" wurde nicht erzeugt."); System.out.println(g.getStatus()); } // Verzeichnis indizieren (rekursiv) static void addDir(File dir, String prefix, FilenameFilter filter, boolean sub) { System.out.println("Durchsuche "+dir.toString()); String[] files=dir.list(filter); if(files!=null) for(int i=0; i<files.length; ++i) { System.out.println("Indiziere "+prefix+files[i]); try { File file=new File(dir, files[i]); FileInputStream in=new FileInputStream(file); DocumentParser p=new DocumentParser(g, in); p.parse(prefix+files[i]); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } // Unterverzeichnisse durchsuchen if(sub) { files=dir.list(); if(files!=null) for(int i=0; i<files.length; ++i) { File subdir=new File(dir, files[i]); if(subdir.isDirectory()) addDir(subdir, prefix+files[i]+File.separator, filter, sub); } } } // Programmeintritt: Argumente auswerten public static void main(String args[]) { if(args.length<1) usage(); g=new SearchEngineGenerator(); try { Class c1=Class.forName("java.util.zip.InflaterInputStream"); Class c2=Class.forName("java.util.zip.DeflaterOutputStream"); zipAvailable=true; } catch(Exception e) { } if(args[0].equalsIgnoreCase("CLEAR")) { save(); } else if(args[0].equalsIgnoreCase("ADD")) { if(args.length<3||args.length>4) usage(); load(); try { URL home=new URL(args[1]); URL doc=new URL(home, args[2]); System.out.println("Indiziere "+doc.toString()); DocumentParser p=new DocumentParser(g, doc.openStream()); p.parse(args.length>=4 ? args[3] : args[2]); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } save(); } else if(args[0].equalsIgnoreCase("ADDW") ||args[0].equalsIgnoreCase("ADDWS")) { boolean sub=args[0].equalsIgnoreCase("ADDWS"); if(args.length!=3) usage(); load(); addDir(new File(args[1]), new String(""), new WildcardFilter(args[2]), sub); save(); } else if(args[0].equalsIgnoreCase("SPIDER")) { if(args.length!=3) usage(); load(); try { URL home=new URL(args[1]); SearchEngineSpider spider=new SearchEngineSpider(home, args[2]); spider.run(g); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } save(); } else usage(); } }