/*
 * Decompiled with CFR 0.152.
 */
package org.apache.cocoon.components.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.cocoon.components.crawler.CocoonCrawler;
import org.apache.cocoon.util.Tokenizer;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

public class SimpleCocoonCrawlerImpl
extends AbstractLogEnabled
implements CocoonCrawler,
Configurable,
Disposable,
Recyclable {
    /** Name of the configuration element that overrides the expected content type of a link list. */
    public static final String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
    /**
     * Default content type of a link list.
     * NOTE(review): unlike the other defaults this constant is an instance field (not static);
     * making it static would change the class's binary interface, so it is left as is.
     */
    public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
    /** Name of the configuration element that overrides the query string used to request a link list. */
    public static final String LINK_VIEW_QUERY_CONFIG = "link-view-query";
    /** Default query string appended to a URL to request its link list. */
    public static final String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
    /** Name of the configuration element(s) holding exclude patterns. */
    public static final String EXCLUDE_CONFIG = "exclude";
    /** Name of the configuration element(s) holding include patterns. */
    public static final String INCLUDE_CONFIG = "include";
    /** Name of the configuration element that overrides the user agent. */
    public static final String USER_AGENT_CONFIG = "user-agent";
    /** Default user agent, taken from {@link Constants#COMPLETE_NAME}. */
    public static final String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
    /** Name of the configuration element that overrides the accept value. */
    public static final String ACCEPT_CONFIG = "accept";
    /** Default accept value. */
    public static final String ACCEPT_DEFAULT = "*/*";
    /** Query string appended to each crawled URL when its link list is requested. */
    private String linkViewQuery = "cocoon-view=links";
    /** Content type a response must have for its body to be read as a link list. */
    private String linkContentType = "application/x-cocoon-links";
    /** Compiled {@link RE} exclude patterns; {@code null} means no URL is excluded. */
    private HashSet excludeCrawlingURL = null;
    /** Compiled {@link RE} include patterns; {@code null} means every URL is included. */
    private HashSet includeCrawlingURL = null;
    /** Value of the {@code user-agent} configuration element, or {@link #USER_AGENT_DEFAULT}. */
    private String userAgent = USER_AGENT_DEFAULT;
    /** Value of the {@code accept} configuration element, or the default accept value. */
    private String accept = "*/*";
    /** String forms of the URLs whose links have already been requested in the current crawl. */
    private HashSet crawled;
    /** {@link URL} objects still waiting to be handed out by the iterator. */
    private HashSet urlsToProcess;

    /**
     * Reads the crawler settings from the component configuration.
     *
     * <ul>
     * <li>{@code include} children: compiled into the include pattern set. When there are
     *     none, the include set is left untouched (a {@code null} set includes every URL).</li>
     * <li>{@code exclude} children: compiled into the exclude pattern set. When there are
     *     none, the built-in default exclude patterns are installed instead.</li>
     * <li>{@code link-content-type}, {@code link-view-query}, {@code user-agent} and
     *     {@code accept}: each replaces the current value only when the element exists and
     *     its trimmed text is non-empty.</li>
     * </ul>
     *
     * @param configuration the configuration of this component
     * @throws ConfigurationException if reading the value of a configuration element fails
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        Configuration[] children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            this.includeCrawlingURL = new HashSet();
            this.addPatterns(children, this.includeCrawlingURL, "including");
        } else if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug("Include all URLs");
        }

        children = configuration.getChildren(EXCLUDE_CONFIG);
        this.excludeCrawlingURL = new HashSet();
        if (children.length > 0) {
            this.addPatterns(children, this.excludeCrawlingURL, "excluding");
        } else {
            this.setDefaultExcludeFromCrawling();
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug("Exclude default URLs only");
            }
        }

        this.linkContentType = this.readSetting(configuration, LINK_CONTENT_TYPE_CONFIG, this.linkContentType);
        this.linkViewQuery = this.readSetting(configuration, LINK_VIEW_QUERY_CONFIG, this.linkViewQuery);
        this.userAgent = this.readSetting(configuration, USER_AGENT_CONFIG, this.userAgent);
        this.accept = this.readSetting(configuration, ACCEPT_CONFIG, this.accept);
    }

    /**
     * Compiles the patterns found in the given configuration elements and adds them to
     * {@code target}.
     *
     * Each element value is split with {@link Tokenizer} using {@code ", "} as delimiters
     * (presumably every character of that string is a delimiter, as with
     * {@code java.util.StringTokenizer} - confirm against {@code Tokenizer}).
     * A syntax error is logged and ends processing of that element only; tokens of the
     * element that were compiled before the error stay in the set.
     *
     * @param children the configuration elements holding the patterns
     * @param target   the set receiving the compiled {@link RE} instances
     * @param kind     word used in the error message ("including" or "excluding")
     * @throws ConfigurationException if the value of an element cannot be read
     */
    private void addPatterns(Configuration[] children, HashSet target, String kind)
            throws ConfigurationException {
        for (int i = 0; i < children.length; i++) {
            String pattern = children[i].getValue();
            try {
                Tokenizer tokens = new Tokenizer(pattern, ", ");
                while (tokens.hasMoreTokens()) {
                    target.add(new RE(tokens.nextToken()));
                }
            } catch (RESyntaxException rese) {
                this.getLogger().error("Cannot create " + kind + " regular-expression for " + pattern, rese);
            }
        }
    }

    /**
     * Returns the trimmed value of the optional configuration element {@code name}.
     *
     * The value is trimmed before the emptiness check, so a whitespace-only element keeps
     * the fallback instead of replacing it with an empty string.
     *
     * @param configuration the configuration of this component
     * @param name          the name of the optional child element
     * @param fallback      the value returned when the element is absent or blank
     * @return the trimmed element value, or {@code fallback}
     * @throws ConfigurationException if the value of the element cannot be read
     */
    private String readSetting(Configuration configuration, String name, String fallback)
            throws ConfigurationException {
        Configuration child = configuration.getChild(name, false);
        if (child == null) {
            return fallback;
        }
        String value = child.getValue();
        if (value == null) {
            return fallback;
        }
        value = value.trim();
        return value.length() > 0 ? value : fallback;
    }

    /**
     * Drops every collection held by this crawler: the include and exclude pattern sets
     * as well as the working state of the current crawl.
     */
    public void dispose() {
        // Pattern sets built by configure().
        includeCrawlingURL = null;
        excludeCrawlingURL = null;
        // Working state of the current crawl.
        urlsToProcess = null;
        crawled = null;
    }

    /**
     * Clears the working state of the current crawl. The configured include and exclude
     * pattern sets are kept.
     */
    public void recycle() {
        urlsToProcess = null;
        crawled = null;
    }

    /**
     * Starts a new crawl at the given URL. Any state of a previous crawl is discarded:
     * both the set of crawled URLs and the set of pending URLs are replaced by fresh
     * sets, and {@code url} becomes the only pending entry.
     *
     * @param url the URL the crawl starts from
     */
    public void crawl(URL url) {
        crawled = new HashSet();
        urlsToProcess = new HashSet();

        final boolean debugEnabled = getLogger().isDebugEnabled();
        if (debugEnabled) {
            getLogger().debug("crawl URL " + url);
        }

        urlsToProcess.add(url);
    }

    /**
     * Creates an iterator over the URLs of the current crawl. The iterator works directly
     * on this crawler's set of pending URLs.
     *
     * @return a new {@link CocoonCrawlerIterator} bound to this crawler
     */
    public Iterator iterator() {
        final Iterator pendingUrls = new CocoonCrawlerIterator(this);
        return pendingUrls;
    }

    /**
     * Adds the built-in exclude patterns (gif, png, jpg/jpeg, js and css resources, each
     * with an optional query string) to the exclude set. The exclude set must already
     * exist. A pattern that fails to compile is logged and skipped.
     */
    private void setDefaultExcludeFromCrawling() {
        final String[] defaultExpressions = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };
        for (int idx = 0; idx < defaultExpressions.length; idx++) {
            final String expression = defaultExpressions[idx];
            try {
                final RE compiled = new RE(expression);
                excludeCrawlingURL.add(compiled);
            } catch (RESyntaxException rese) {
                getLogger().error("Cannot create excluding regular-expression for " + expression, rese);
            }
        }
    }

    /*
     * Unable to fully structure code
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     */
    private List getLinks(URL url) {
        block25: {
            block24: {
                url_links = null;
                sURL = url.toString();
                if (this.isIncludedURL(sURL) == false) return null;
                if (this.isExcludedURL(sURL)) {
                    return null;
                }
                if (this.crawled.contains(sURL)) {
                    return null;
                }
                this.crawled.add(sURL);
                if (this.getLogger().isDebugEnabled()) {
                    this.getLogger().debug("Getting links of URL " + sURL);
                }
                br = null;
                try {
                    try {
                        sURL = url.getFile();
                        links = new URL(url, sURL + (sURL.indexOf("?") == -1 ? "?" : "&") + this.linkViewQuery);
                        links_url_connection = links.openConnection();
                        is = links_url_connection.getInputStream();
                        br = new BufferedReader(new InputStreamReader(is));
                        contentType = links_url_connection.getContentType();
                        if (contentType == null) {
                            if (this.getLogger().isDebugEnabled()) {
                                this.getLogger().debug("Ignoring " + sURL + " (no content type)");
                            }
                            var9_10 = null;
                            var14_12 = null;
                            if (br == null) return var9_10;
                            break block24;
                        }
                        index = contentType.indexOf(59);
                        if (index != -1) {
                            contentType = contentType.substring(0, index);
                        }
                        if (this.getLogger().isDebugEnabled()) {
                            this.getLogger().debug("Content-type: " + contentType);
                        }
                        if (contentType.equals(this.linkContentType)) {
                            url_links = new ArrayList<URL>();
                            while ((line = br.readLine()) != null) {
                                new_url = new URL(url, (String)var10_20);
                                add_url = true;
                                if (add_url) {
                                    add_url &= url_links.contains(new_url) == false;
                                }
                                if (add_url) {
                                    add_url &= this.crawled.contains(new_url.toString()) == false;
                                }
                                if (add_url) {
                                    add_url &= this.isIncludedURL(new_url.toString());
                                }
                                if (add_url) {
                                    add_url &= this.isExcludedURL(new_url.toString()) == false;
                                }
                                if (!add_url) continue;
                                if (this.getLogger().isDebugEnabled()) {
                                    this.getLogger().debug("Add URL: " + new_url.toString());
                                }
                                url_links.add(new_url);
                            }
                        }
                        break block25;
                    }
                    catch (IOException ioe) {
                        this.getLogger().warn("Problems get links of " + url, (Throwable)ioe);
                        var14_14 = null;
                        if (br == null) return url_links;
                        try {
                            br.close();
                            br = null;
                            return url_links;
                        }
                        catch (IOException ignored) {
                            return url_links;
                        }
                    }
                }
                catch (Throwable var13_23) {
                    var14_15 = null;
                    if (br == null) throw var13_23;
                    ** try [egrp 2[TRYBLOCK] [2 : 574->585)] { 
lbl70:
                    // 1 sources

                    br.close();
                    br = null;
                    throw var13_23;
lbl73:
                    // 1 sources

                    catch (IOException ignored) {
                        // empty catch block
                    }
                    throw var13_23;
                }
            }
            ** try [egrp 2[TRYBLOCK] [2 : 574->585)] { 
lbl78:
            // 1 sources

            br.close();
            return var9_10;
lbl80:
            // 1 sources

            catch (IOException ignored) {
                // empty catch block
            }
            return var9_10;
        }
        var14_13 = null;
        if (br == null) return url_links;
        try {}
        catch (IOException ignored) {}
        br.close();
        br = null;
        return url_links;
        return url_links;
    }

    /**
     * Tells whether the given URL matches one of the exclude patterns.
     *
     * @param url the string form of the URL to check
     * @return {@code true} if at least one exclude pattern matches; {@code false} if none
     *         matches or no exclude set exists
     */
    private boolean isExcludedURL(String url) {
        if (excludeCrawlingURL == null) {
            return false;
        }
        // Kept from the original so the method still dereferences url before matching.
        final String candidate = url.toString();
        for (Iterator patterns = excludeCrawlingURL.iterator(); patterns.hasNext();) {
            final RE expression = (RE) patterns.next();
            if (expression.match(candidate)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Excluded URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not excluded URL " + url);
        }
        return false;
    }

    /**
     * Tells whether the given URL matches one of the include patterns.
     *
     * @param url the string form of the URL to check
     * @return {@code true} if no include set exists or at least one include pattern
     *         matches; {@code false} otherwise
     */
    private boolean isIncludedURL(String url) {
        if (includeCrawlingURL == null) {
            return true;
        }
        // Kept from the original so the method still dereferences url before matching.
        final String candidate = url.toString();
        for (Iterator patterns = includeCrawlingURL.iterator(); patterns.hasNext();) {
            final RE expression = (RE) patterns.next();
            if (expression.match(candidate)) {
                if (getLogger().isDebugEnabled()) {
                    getLogger().debug("Included URL " + url);
                }
                return true;
            }
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("Not included URL " + url);
        }
        return false;
    }

    /**
     * Iterator over the URLs of a crawl. It works directly on the crawler's set of
     * pending URLs: every call to {@link #next()} takes one URL out of that set, requests
     * its links and adds the new links to the set.
     */
    public static class CocoonCrawlerIterator
    implements Iterator {
        private final SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * @param cocoonCrawler the crawler whose pending URLs are iterated
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        /**
         * @return {@code true} if at least one URL is pending; {@code false} if none is
         *         pending or the crawler currently has no pending set (no crawl started,
         *         or the crawler was recycled or disposed)
         */
        public boolean hasNext() {
            HashSet pending = this.cocoonCrawler.urlsToProcess;
            return pending != null && !pending.isEmpty();
        }

        /**
         * Takes one pending URL, requests its links and queues the new ones.
         *
         * @return the next {@link URL}, or {@code null} when nothing is pending. Returning
         *         {@code null} instead of throwing {@code NoSuchElementException} is kept
         *         for compatibility with existing callers.
         */
        public Object next() {
            HashSet pending = this.cocoonCrawler.urlsToProcess;
            if (pending == null || pending.isEmpty()) {
                return null;
            }
            Iterator i = pending.iterator();
            URL url = (URL) i.next();
            // Remove through the iterator: no second hash/equals lookup on the URL.
            i.remove();
            List url_links = this.cocoonCrawler.getLinks(url);
            if (url_links != null) {
                pending.addAll(url_links);
            }
            return url;
        }

        /**
         * Not supported.
         *
         * @throws UnsupportedOperationException always
         */
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}

