home *** CD-ROM | disk | FTP | other *** search
- import java.io.IOException;
- import java.io.InputStream;
- import java.net.URL;
-
- class DocumentParser {
- SearchEngineSpider spider;
- // $FF: renamed from: in java.io.InputStream
- InputStream field_0;
- // $FF: renamed from: g SearchEngineGenerator
- SearchEngineGenerator field_1;
- String section;
- URL url;
- int pos;
- int unget;
-
- public DocumentParser(SearchEngineGenerator var1, InputStream var2, SearchEngineSpider var3, URL var4) {
- this.field_1 = var1;
- this.field_0 = var2;
- this.spider = var3;
- this.url = var4;
- this.unget = -1;
- }
-
- public DocumentParser(SearchEngineGenerator var1, InputStream var2) {
- this.field_1 = var1;
- this.field_0 = var2;
- this.url = null;
- this.spider = null;
- this.unget = -1;
- }
-
- int get() throws IOException {
- int var1;
- if (this.unget >= 0) {
- var1 = this.unget;
- this.unget = -1;
- } else {
- var1 = this.field_0.read();
- }
-
- return var1;
- }
-
- static boolean isWS(int var0) {
- return var0 == 32 || var0 == 9 || var0 == 13 || var0 == 10;
- }
-
- static boolean isAlpha(int var0) {
- return Character.isLetterOrDigit((char)var0);
- }
-
- String getSection() throws IOException {
- StringBuffer var1 = new StringBuffer();
- this.pos = 0;
- boolean var4 = false;
- boolean var5 = false;
-
- int var2;
- while((var2 = this.get()) >= 0 && isWS(var2)) {
- }
-
- boolean var3 = var2 == 60;
- if (var2 >= 0) {
- this.unget = var2;
-
- while((var2 = this.get()) >= 0) {
- if (var3) {
- if (var4) {
- var1.append((char)var2);
- if (var2 == 34) {
- var5 = false;
- var4 = false;
- }
- continue;
- }
-
- if (var2 == 34) {
- var1.append('"');
- var4 = true;
- continue;
- }
-
- if (var2 == 62) {
- var1.append('>');
- break;
- }
- } else if (var2 == 60) {
- this.unget = var2;
- break;
- }
-
- if (isWS(var2)) {
- if (!var5) {
- var1.append(' ');
- var5 = true;
- }
- } else {
- var5 = false;
- var1.append((char)var2);
- }
- }
- }
-
- return this.section = HTMLcvt.fromHTML(var1.toString());
- }
-
- int getChar() {
- if (this.pos >= this.section.length()) {
- return -1;
- } else {
- char var1 = this.section.charAt(this.pos++);
- return var1;
- }
- }
-
- String getWord() {
- StringBuffer var1 = new StringBuffer();
-
- int var2;
- while((var2 = this.getChar()) >= 0 && !isAlpha(var2)) {
- }
-
- if (var2 >= 0) {
- do {
- var1.append((char)var2);
- } while((var2 = this.getChar()) >= 0 && isAlpha(var2));
- }
-
- return var1.toString();
- }
-
- void spiderFollow(String var1, String var2, String var3) {
- int var4 = var3.indexOf(var1);
- if (var4 >= 0) {
- var4 += var1.length();
-
- try {
- if (var2.charAt(var4) == ' ') {
- ++var4;
- }
-
- if (var2.charAt(var4++) == '=') {
- if (var2.charAt(var4) == ' ') {
- ++var4;
- }
-
- StringBuffer var5 = new StringBuffer();
- if (var2.charAt(var4) == '"') {
- while(true) {
- ++var4;
- if (var2.charAt(var4) == '"') {
- break;
- }
-
- var5.append(var2.charAt(var4));
- }
- } else {
- while(true) {
- char var6 = var2.charAt(var4++);
- if (var6 == ' ' || var6 == '>') {
- break;
- }
-
- var5.append((char)var6);
- }
- }
-
- this.spider.follow(this.url, var5.toString());
- }
- } catch (Exception var7) {
- }
- }
- }
-
- public void parse(String var1) throws IOException {
- int var2 = this.field_1.addDocument(var1, "");
- int var3 = 0;
- int var4 = 0;
- boolean var5 = false;
- boolean var6 = true;
- StringBuffer var7 = new StringBuffer();
- String var8 = null;
-
- String var9;
- while((var9 = this.getSection()).length() > 0) {
- if (var6) {
- System.out.print("\r * " + (var8 == null ? "" : var8 + ": ") + this.field_1.getDocs() + "/");
- var4 = 0;
- var6 = false;
- }
-
- if (var9.startsWith("<")) {
- String var12 = var9.toUpperCase();
- if (var12.startsWith("<TITLE")) {
- var5 = true;
- }
-
- if (var12.startsWith("</TITLE")) {
- var5 = false;
- var8 = var7.toString();
- var6 = true;
- }
-
- if (var12.startsWith("<A ") && this.spider != null) {
- this.spiderFollow("HREF", var9, var12);
- }
-
- if (var12.startsWith("<FRAME ") && this.spider != null) {
- this.spiderFollow("SRC", var9, var12);
- }
-
- if (var12.startsWith("<AREA ") && this.spider != null) {
- this.spiderFollow("HREF", var9, var12);
- }
- } else {
- if (var5) {
- if (var7.length() > 0) {
- var7.append(' ');
- }
-
- var7.append(this.section);
- }
-
- String var10;
- while((var10 = this.getWord()).length() > 0) {
- this.field_1.addWord(var2, var10.toUpperCase());
- StringBuffer var10000 = new StringBuffer();
- ++var3;
- String var11 = var10000.append(var3).append("/").append(this.field_1.getSize()).toString();
-
- while(var4-- > 0) {
- System.out.print('\b');
- }
-
- System.out.print(var11);
- var4 = var11.length();
- }
- }
- }
-
- if (var8 != null) {
- this.field_1.addDocument(var1, var8);
- }
-
- System.out.println("");
- }
- }
-