PC Online 1997 December

home *** CD-ROM | disk | FTP | other *** search

/ PC Online 1997 December / PCO1297.ISO / FilesBBS / FREI / CLS.ARJ / CLS.ZIP / ClientSearchGenerator.java < prev next >

Wrap

Java Source | 1997-11-02 | 12.9 KB | 410 lines

import java.applet.*; import java.awt.*; import java.util.*; import java.io.*; import java.net.*; import java.util.zip.*; class DocumentParser { SearchEngineSpider spider; InputStream in; SearchEngineGenerator g; String section; URL url; int pos, unget; public DocumentParser(SearchEngineGenerator g, InputStream in, SearchEngineSpider s, URL url) { this.g=g; this.in=in; this.spider=s; this.url=url; unget=-1; } public DocumentParser(SearchEngineGenerator g, InputStream in) { this.g=g; this.in=in; this.url=null; this.spider=null; unget=-1; } int get() throws IOException { int r; if(unget>=0) { r=unget; unget=-1; } else r=in.read(); return r; } static boolean isWS(int c) { return Character.isWhitespace((char)c); } static boolean isAlpha(int c) { return Character.isLetterOrDigit((char)c); } // Liefert entweder einen Tag (komplett, Whitespace auf je ein Leerzeichen reduziert) // oder einen Textbereich zwischen zwei Tags String getSection() throws IOException { StringBuffer b=new StringBuffer(); pos=0; int c; boolean tag, quote=false, ws=false; while( (c=get())>=0 && isWS(c) ); // WS ⁿberspringen tag=c=='<'; if(c>=0) { unget=c; while( (c=get())>=0 ) { if(tag) { if(quote) { // "..."? b.append((char)c); if(c=='\"') quote=ws=false; continue; } if(c=='\"') { b.append('\"'); quote=true; continue; } if(c=='>') { b.append('>'); break; } } else { if(c=='<') { unget=c; break; } } if(isWS(c)) { // Leerraum verkⁿrzen if(!ws) { b.append(' '); ws=true; } continue; } ws=false; b.append((char)c); } } return section=HTMLcvt.fromHTML(b.toString()); } int getChar() { if(pos>=section.length()) return -1; int c=section.charAt(pos++); return c; } // Extrahiert das nΣchste Wort aus dem aktuellen Bereich String getWord() { StringBuffer w=new StringBuffer(); int c; while( (c=getChar())>=0 ) if(isAlpha(c)) break; if(c>=0) { for(;;) { w.append((char)c); if((c=getChar())<0) break; if(!isAlpha(c)) break; } } return w.toString(); } // <A href="..."> oder "<FRAME SRC="..."> auswerten void spiderFollow(String prefix, String tag, String uptag) { int n=uptag.indexOf(prefix); if(n<0) return; n+=prefix.length(); try { if(tag.charAt(n)==' ') ++n; // Evtl. Whitespace if(tag.charAt(n++)!='=') return; // "=" fehlt if(tag.charAt(n)==' ') ++n; // Evtl. Whitespace StringBuffer link=new StringBuffer(); if(tag.charAt(n)=='\"') { // Link in Anfⁿhrungszeichen? while(tag.charAt(++n)!='\"') link.append((char)tag.charAt(n)); } else { for(;;) { int ch=tag.charAt(n++); if(ch==' ' || ch=='>') break; link.append((char)ch); } } spider.follow(url, link.toString()); } catch(Exception e) { } } public void parse(String document) throws IOException { int doc=g.addDocument(document, ""); int words=0, back=0; boolean inTitle=false, init=true; StringBuffer title=new StringBuffer(); String titleString=null; for(String s; (s=getSection()).length()>0; ) { if(init) { System.out.print("\r * "+(titleString==null ? document : titleString)+": "+g.getDocs()+"/"); back=0; init=false; } if(s.startsWith("<")) { // TAG? String tag=s.toUpperCase(); if(tag.startsWith("<TITLE")) inTitle=true; if(tag.startsWith("</TITLE")) { inTitle=false; titleString=title.toString(); init=true; } if(tag.startsWith("<A")) { // <A href="..."> if(spider!=null) spiderFollow("HREF", s, tag); } if(tag.startsWith("<FRAME")) { // <FRAME src="..."> if(spider!=null) spiderFollow("SRC", s, tag); } } else { if(inTitle) { if(title.length()>0) title.append(' '); title.append(section); } for(String t; (t=getWord()).length()>0; ) { g.addWord(doc, t.toUpperCase()); String s2=""+(++words)+"/"+g.getSize(); while(back-->0) System.out.print('\b'); System.out.print(s2); back=s2.length(); } } } if(titleString!=null) g.addDocument(document, titleString); System.out.println(""); } }; class SearchEngineSpider { URL home; String homes; Properties done, todo; SearchEngineSpider(URL home, String doc) { this.home=home; done=new Properties(); todo=new Properties(); homes=home.toString(); follow(home, doc); } // Link verfolgen public void follow(URL current, String doc) { int n=doc.lastIndexOf('#'); // Marke? if(n>=0) doc=doc.substring(0, n); try { URL newURL=new URL(current, doc); String newdoc=newURL.toString(); if(done.get(newdoc)==null) // Noch nicht erledigt? todo.put(newdoc, ""); // Dann vormerken } catch(Exception e) { } } void run(SearchEngineGenerator g) { Enumeration n; while( (n=todo.keys()).hasMoreElements() ) { // NΣchsten Eintrag aus der todo-Liste holen String key=(String)n.nextElement(); todo.remove(key); done.put(key, ""); // Links ignorieren, die nicht unterhalb der Basis-URL liegen try { URL doc=new URL(key); String docs=doc.toString(), updocs=docs.toUpperCase(); if(!docs.startsWith(homes)) { System.out.println("Ignoriere externen Link "+docs); } else if(! (updocs.endsWith("/") || updocs.endsWith(".HTM") || updocs.endsWith(".HTML")) ) { System.out.println("Ignoriere Link mit unbekannter Erweiterung: "+docs); } else { System.out.println("Indiziere "+docs); DocumentParser p=new DocumentParser(g, doc.openStream(), this, doc); p.parse(key.substring(homes.length())); } } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } } }; class SearchEngineGenerator { Properties tab; Hashtable docs; Properties exclude; int nextDoc; SearchEngineGenerator() { nextDoc=0; tab=new Properties(); docs=new Hashtable(); exclude=new Properties(); try { InputStream in=new FileInputStream("exclude.dat"); exclude.load(in); in.close(); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } void clear() { tab.clear(); docs.clear(); nextDoc=0; } void save(OutputStream out, boolean compress) throws Exception { if(compress) out=new DeflaterOutputStream(out, new Deflater(Deflater.BEST_COMPRESSION)); tab.save(out, "SearchEngine"); out.close(); } void load(InputStream in, boolean compress) throws Exception { clear(); if(compress) in=new InflaterInputStream(in); tab.load(in); in.close(); for(;;) { String s=tab.getProperty("."+IntToString(nextDoc)); if(s==null) break; SearchEngineResult r=new SearchEngineResult(s); addDocument(r.document, r.caption); } } public int getSize() { return tab.size(); } public int getDocs() { return docs.size(); } public String getStatus() { return ""+getDocs()+" docs/"+getSize()+" words"; } public int addDocument(String document, String caption) { Integer b=(Integer)docs.get(document); if(b==null) b=new Integer(nextDoc++); docs.put(document, b); tab.put("."+IntToString(b.intValue()), document+","+HTMLcvt.toHTML(caption)); return b.intValue(); } static final String encoding="0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; static final int enclen=62; static public char IntToChar(int i) { return encoding.charAt(i); } static public int CharToInt(char c) { return encoding.indexOf((int)c); } public static int StringToInt(String s, int index) { return CharToInt(s.charAt(index))*enclen+CharToInt(s.charAt(index+1)); } public static String IntToString(int i) { return ""+IntToChar(i/enclen)+IntToChar(i%enclen); } public void addWord(int document, String word) { if(exclude!=null && exclude.getProperty(word)!=null) return; if(word.length()<2) return; String t=tab.getProperty(word); if(t==null) { tab.put(word, IntToString(document)); } else { boolean found=false; for(int i=0; i<t.length(); i+=2) { if(document==StringToInt(t, i)) found=true; } if(!found) { tab.put(word, t+IntToString(document)); } } } } class ClientSearchGenerator { static SearchEngineGenerator g; static void usage() { System.out.println("Gⁿltige Argumente:"); System.out.println(" CLEAR Suchindex loeschen"); System.out.println(" ADD home file [url] Datei indizieren"); System.out.println(" ADDW home file Mehrere Dateien indiziern (mit Wildcards)"); System.out.println(" ADDWS home file Mehrere Dateien indiziern (mit Wildcards "); System.out.println(" und Unterverzeichnissen)"); System.out.println(" SPIDER home file Dokument und alle Verweise indizieren"); System.exit(1); } static void load() { // Aktuellen Stand laden System.out.println("Lade "+ClientSearch.rawfile); try { InputStream in=new FileInputStream(ClientSearch.rawfile); g.load(in, false); in.close(); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } System.out.println(g.getStatus()); } static void save() { try { System.out.println("Speichere "+ClientSearch.rawfile); g.save(new FileOutputStream(ClientSearch.rawfile), false); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } try { System.out.println("Speichere "+ClientSearch.zipfile); g.save(new FileOutputStream(ClientSearch.zipfile), true); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } System.out.println(g.getStatus()); } static void addDir(File dir, String prefix, FilenameFilter filter, boolean sub) { System.out.println("Durchsuche "+dir.toString()); String[] files=dir.list(filter); if(files!=null) for(int i=0; i<files.length; ++i) { System.out.println("Indiziere "+prefix+files[i]); try { File file=new File(dir, files[i]); FileInputStream in=new FileInputStream(file); DocumentParser p=new DocumentParser(g, in); p.parse(prefix+files[i]); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } } if(sub) { files=dir.list(); if(files!=null) for(int i=0; i<files.length; ++i) { File subdir=new File(dir, files[i]); if(subdir.isDirectory()) addDir(subdir, prefix+files[i]+File.separator, filter, sub); } } } public static void main(String args[]) { if(args.length<1) usage(); g=new SearchEngineGenerator(); if(args[0].equalsIgnoreCase("CLEAR")) { save(); } else if(args[0].equalsIgnoreCase("ADD")) { if(args.length<3||args.length>4) usage(); load(); try { URL home=new URL(args[1]); URL doc=new URL(home, args[2]); System.out.println("Indiziere "+doc.toString()); DocumentParser p=new DocumentParser(g, doc.openStream()); p.parse(args.length>=4 ? args[3] : args[2]); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } save(); } else if(args[0].equalsIgnoreCase("ADDW") ||args[0].equalsIgnoreCase("ADDWS")) { boolean sub=args[0].equalsIgnoreCase("ADDWS"); if(args.length!=3) usage(); load(); addDir(new File(args[1]), new String(""), new WildcardFilter(args[2]), sub); save(); } else if(args[0].equalsIgnoreCase("SPIDER")) { if(args.length!=3) usage(); load(); try { URL home=new URL(args[1]); SearchEngineSpider spider=new SearchEngineSpider(home, args[2]); spider.run(g); } catch(Exception e) { System.err.println("Exception: "+e.getMessage()); } save(); } else usage(); } }