home *** CD-ROM | disk | FTP | other *** search
- package netscape.netcast.application;
-
- import java.io.InputStream;
- import java.io.StreamTokenizer;
- import java.net.URL;
- import java.util.Vector;
- import netscape.net.Cache;
- import netscape.net.CacheRequest;
-
- public class RobotExclusion {
- private static final int INFO_AVAILABLE = 1;
- private static final int INFO_NOT_AVAILABLE = 2;
- private static final int NOT_YET_QUERIED = 3;
- private static final String USERAGENT = "User-agent:";
- private static final String DISALLOW = "Disallow:";
- private static final String ASTERISK = "*";
- private static final String MOZILLA = "mozilla";
- private String site = "";
- private int isInfoAvailable = 3;
- private InputStream urlStream;
- private Vector infoList = new Vector();
-
- public RobotExclusion(String var1) {
- this.site = var1;
- }
-
- public boolean isRobotAllowed(URL var1, Cache var2) {
- try {
- if (this.isInfoAvailable == 3) {
- StringBuffer var3 = new StringBuffer(this.site);
- var3.append("/robots.txt");
- String var4 = new String(var3);
- this.isInfoAvailable = this.getInfoAvailability(var4, var2);
- }
-
- if (this.isInfoAvailable == 2) {
- return true;
- } else if (this.isInfoAvailable == 1) {
- return !this.matchPath(var1);
- } else {
- return true;
- }
- } catch (Exception var5) {
- HTMLCrawler.printException(var5, "RobotExclusion.isRobotAllowed");
- return true;
- }
- }
-
- public int getInfoAvailability(String var1, Cache var2) {
- try {
- CacheRequest var3 = new CacheRequest(var1);
- if (var3 == null) {
- return 2;
- } else {
- this.urlStream = var2.getInputStream(var3);
- if (this.urlStream == null) {
- return 2;
- } else {
- this.parseExclusionInfo();
- var2.remove(var1);
- return 1;
- }
- }
- } catch (Exception var4) {
- DebugManager.println("File " + var1 + " is not available");
- return 2;
- }
- }
-
- public void parseExclusionInfo() {
- try {
- StreamTokenizer var1 = new StreamTokenizer(this.urlStream);
- boolean var3 = false;
- boolean var4 = false;
- boolean var5 = false;
- boolean var6 = false;
- boolean var7 = false;
- boolean var8 = false;
- boolean var9 = false;
- var1.resetSyntax();
- var1.eolIsSignificant(true);
- var1.ordinaryChar(47);
- var1.commentChar(35);
- var1.wordChars(47, 47);
- var1.wordChars(58, 58);
- var1.wordChars(64, 64);
- var1.wordChars(38, 38);
- var1.wordChars(61, 61);
- var1.wordChars(37, 37);
- var1.wordChars(48, 57);
- var1.wordChars(97, 122);
- var1.wordChars(65, 90);
- var1.wordChars(36, 36);
- var1.wordChars(45, 45);
- var1.wordChars(95, 95);
- var1.wordChars(46, 46);
- var1.wordChars(43, 43);
- var1.wordChars(33, 33);
- var1.wordChars(42, 42);
- var1.wordChars(39, 39);
- var1.wordChars(40, 40);
- var1.wordChars(41, 41);
- var1.wordChars(44, 44);
- var1.wordChars(126, 126);
- var1.whitespaceChars(0, 32);
-
- int var2;
- while((var2 = var1.nextToken()) != -1 && !var4) {
- switch (var2) {
- case -3:
- if (var7) {
- var7 = false;
- }
-
- if (var8) {
- String var10 = var1.sval.toLowerCase();
- if (var10.equals("*") || var10.equals("mozilla")) {
- var3 = true;
- var8 = false;
- }
- }
-
- if (var1.sval.equals("User-agent:")) {
- var8 = true;
- var5 = true;
- } else if (var1.sval.indexOf("User-agent:") >= 0) {
- var8 = true;
- var5 = true;
- int var16 = var1.sval.indexOf(":");
- String var11 = var1.sval.substring(var16 + 1).toLowerCase();
- if (var11.equals("*") || var11.equals("mozilla")) {
- var3 = true;
- var8 = false;
- }
- }
-
- if (var9 && var3) {
- String var17 = this.decodeString(var1.sval);
- this.infoList.addElement(var17.toLowerCase());
- var9 = false;
- }
-
- if (var1.sval.equals("Disallow:")) {
- if (var3) {
- var9 = true;
- }
- } else if (var1.sval.indexOf("Disallow:") >= 0 && var3) {
- var9 = true;
- int var18 = var1.sval.indexOf(":");
- String var19 = var1.sval.substring(var18 + 1);
- String var12 = this.decodeString(var19);
- this.infoList.addElement(var12.toLowerCase());
- var9 = false;
- }
- break;
- case -2:
- if (var7) {
- var7 = false;
- }
- break;
- case 10:
- var8 = false;
- var9 = false;
- if (var7 && var5) {
- var6 = true;
- var5 = false;
- var7 = false;
- if (var3) {
- var3 = false;
- var4 = true;
- }
- break;
- }
-
- var7 = true;
- break;
- default:
- if (var7) {
- var7 = false;
- }
- }
- }
-
- DebugManager.println("infoList " + this.infoList);
- } catch (Exception var13) {
- DebugManager.println("File robots.txt is not available");
- }
- }
-
- public boolean matchPath(URL var1) {
- try {
- String var2 = var1.getFile();
- int var3 = this.infoList.size();
-
- for(int var4 = 0; var4 < var3; ++var4) {
- String var5 = (String)this.infoList.elementAt(var4);
- String var6 = this.decodeString(var2);
- if (var6.length() >= 0 && var6.toLowerCase().indexOf(var5) >= 0) {
- DebugManager.println("URLpath " + var1 + "matches record path " + var5);
- return true;
- }
- }
-
- return false;
- } catch (Exception var7) {
- HTMLCrawler.printException(var7, "RobotExclusion.matchPath");
- return false;
- }
- }
-
- public String decodeString(String var1) {
- try {
- char[] var2 = new char[]{';', '/', ':', '@', '=', '&'};
- byte var3 = 6;
- boolean var4 = false;
- int var5 = 0;
- if (var1.length() > 0) {
- while(var5 <= var1.length()) {
- var4 = false;
- if ((var5 = var1.indexOf("%", var5)) == -1 || var5 + 3 > var1.length()) {
- break;
- }
-
- String var6 = var1.substring(var5 + 1, var5 + 3);
- char var7 = (char)Integer.parseInt(var6, 16);
- int var8 = 0;
-
- while(true) {
- if (var8 < var3) {
- if (var7 != var2[var8]) {
- ++var8;
- continue;
- }
-
- var4 = true;
- var5 += 3;
- }
-
- if (!var4) {
- if (var5 + 3 <= var1.length()) {
- var1 = var1.substring(0, var5) + var7 + var1.substring(var5 + 3);
- }
-
- if (var7 == '%') {
- ++var5;
- }
- }
- break;
- }
- }
- }
-
- return var1;
- } catch (Exception var9) {
- HTMLCrawler.printException(var9, "RobotExclusion.decodeString");
- return var1;
- }
- }
- }
-