Load the root URL and find all the links in it (they start with http:// and end with " or some other character you have to figure out yourself).
Only the links that match the regexp are considered; you have to use the Pattern and Matcher classes and the find method (sketch below the steps).
Recursively visit every matching address. If its content type is not HTML, you just remember that the address was seen; for HTML pages you go into recursion.
Finally you print all the relevant addresses found, together with how many times each occurred.
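To make the Pattern/Matcher step concrete, here is a minimal sketch of how the link extraction could look when done with find() over the page source; the URL_REGEX, the class and the method names are my own illustration, not part of the assignment:

Code:
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtractor
{
	// a link starts with http:// and runs until a quote, bracket, parenthesis or whitespace
	private static final Pattern URL_REGEX = Pattern.compile("http://[^\"'()<>\\s]+");

	// returns every link in the page source that also matches the accept pattern
	static List<String> extractLinks(String html, Pattern acceptPattern)
	{
		List<String> links = new ArrayList<String>();
		Matcher m = URL_REGEX.matcher(html);
		while (m.find()) // jump to the next occurrence of the URL pattern
		{
			String link = m.group();
			if (acceptPattern.matcher(link).find()) // keep only the links the assignment cares about
			{
				links.add(link);
			}
		}
		return links;
	}
}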
The result had to be presented from the console, i.e. in Eclipse you do Export - executable jar.
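The exported jar is then run from the console with the root URL, the accept regexp and an optional limit on the number of URLs, so roughly java -jar crawler.jar http://example.com "http://example\.com/.*" 50 (the jar name and the arguments here are only an illustration, they depend on how you export it and what you want to crawl).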
I appreciated that the assignment was handed out in print.
The downside was that there were no test inputs, so we had to keep calling the lab instructor over to check our work; having at least a few sample inputs available would have been great.
Code:
package test;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class crawler
{
	static String rootUrl;        // starting point of the crawl
	static String acceptPattern;  // regexp a link has to match to be counted
	static int maxUrls;           // stop after this many distinct matching URLs (0 = unlimited)
	static int collected = 0;     // how many distinct matching URLs we have so far
	static HashMap<String, Integer> crawled = new HashMap<String, Integer>(); // URL -> occurrence count

	public static void main(String[] args)
	{
		if (args.length < 2) { System.out.println("bad input"); System.exit(1); }
		rootUrl = args[0];
		acceptPattern = args[1];
		if (args.length > 2)
		{
			try
			{
				maxUrls = Integer.parseInt(args[2]);
			}
			catch (Exception e) { System.out.println("bad input"); System.exit(1); }
			if (maxUrls < 0) { System.out.println("bad input"); System.exit(1); }
			if (maxUrls == 0) { maxUrls = Integer.MAX_VALUE; }
		}
		else
		{
			maxUrls = Integer.MAX_VALUE;
		}
		// the root URL itself is counted only if it passes the same find() filter as the links
		if (Pattern.compile(acceptPattern).matcher(rootUrl).find())
		{
			crawled.put(rootUrl, 1);
			collected++;
		}
		crawl(rootUrl);
		print();
	}
	private static void crawl(String url)
	{
		String result = urlConn(url);
		ArrayList<String> links = new ArrayList<String>();
		if (result != null)
		{
			// cut out every substring starting with http:// and ending at the first
			// character that can terminate a URL in the page source
			while (result.contains("http://"))
			{
				result = result.substring(result.indexOf("http://"));
				String[] w = result.split("\"|\'|\\(|\\<|\\>|\\)", 2); // all the characters a URL can end with
				result = result.replaceFirst("http://", ""); // drop this http:// so the next iteration finds the following one
				links.add(w[0]);
			}
			ArrayList<String> toDo = new ArrayList<String>();
			Pattern patt = Pattern.compile(acceptPattern); // compile once, not once per link
			for (String link : links)
			{
				Matcher m = patt.matcher(link);
				if (!m.find()) { continue; } // only links matching the accept pattern are counted
				if (crawled.containsKey(link))
				{
					crawled.put(link, crawled.get(link) + 1); // already seen, just increase the count
				}
				else
				{
					if (collected++ == maxUrls)
					{
						print(); // limit reached: print what we have and exit
					}
					crawled.put(link, 1);
					toDo.add(link); // new link, schedule it for recursion
				}
			}
			for (String link : toDo)
			{
				crawl(link);
			}
		}
	}
	private static String urlConn(String url)
	{
		try
		{
			URL myurl = new URL(url);
			HttpURLConnection huc = (HttpURLConnection) myurl.openConnection();
			huc.setInstanceFollowRedirects(false); // the static setFollowRedirects() would change the global default, not this connection
			huc.setConnectTimeout(1000);
			huc.setReadTimeout(300);
			huc.connect();
			String contentType = huc.getContentType();
			// getContentType() can return null; only HTML pages are downloaded and parsed further
			if (contentType == null || !contentType.toLowerCase().matches("text[-/]html.*"))
			{
				return null;
			}
			BufferedReader in = new BufferedReader(new InputStreamReader(huc.getInputStream()));
			StringBuilder output = new StringBuilder();
			String inputLine;
			while ((inputLine = in.readLine()) != null)
			{
				output.append(inputLine);
			}
			in.close();
			return output.toString();
		}
		catch (Exception e)
		{
			// unreachable host, timeout, malformed URL: treat the page as non-HTML
			return null;
		}
	}
	private static void print()
	{
		for (Entry<String, Integer> entry : crawled.entrySet())
		{
			System.out.println(String.format("[%d]\t%s", entry.getValue(), entry.getKey()));
		}
		System.exit(0);
	}
}