您好,欢迎来到哗拓教育。
搜索
您的当前位置:首页 > 网络爬虫源代码【可编辑范本】

网络爬虫源代码【可编辑范本】

来源:哗拓教育
网络爬虫源代码

public class Spider implements Runnable

{

private ArrayList urls; //URL列表

private HashMap indexedURLs; //已经检索过的URL列表

private int threads ; //初始化线程数

public static void main(String argv[]) throws Exception

{

if(argv[0] == null)

{

System.out.println("Missing required argument: [Sit URL]\");

return ;

}

Spider Spider = new Spider(argv[0]);

Spider。go();

public Spider(String strURL)

urls = new ArrayList();

threads = 10;

urls。add(strURL);

threadList = new ArrayList();

indexedURLs = new HashMap();

if (urls.size() == 0)

throw new IllegalArgumentException(”Missing required argument: —u [start url]”);

if (threads 〈 1)

(\"Invalid number of threads: ” +

threads);

public void go(String strURL) throws Exception {

// index each entry point URL

long start = System.currentTimeMillis();

for (int i = 0; i < threads; i++) {

Thread t = new Thread(this, ”Spide ” t.start();

threadList.add(t);

}

while (threadList。size() >; 0) {

+1));

+ (i Thread child = (Thread)threadList.remove(0);

child.join();

}

long elapsed = System.currentTimeMillis() - start;

}

public void run() {

String url;

try {

while ((url = dequeueURL()) != null) {

indexURL(url);

}

}catch(Exception e) {

logger.info(e.getMessage());

//检测URL列表容器中有没有URL没有被解析,如果有则返回URL由线程继续执行

public synchronized String dequeueURL() throws Exception while (true) {

if (urls.size() >; 0)

{

return (String)urls。remove(0);

else {

threads-—;

if (threads >; 0)

{ wait();

threads++;

}

else

{

notifyAll();

return null;

}

}

}

/*

* 添加URL和当前URL的级数,并唤醒睡眠线程

*/

public synchronized void enqueueURL(String url,int level)

{

if (indexedURLs。get(url) == null)

{

urls。add(url);

indexedURLs。put(url, new Integer(level));

notifyAll();

}

}

/**

* 通过URL解析出网页内容并解析出页面上的URL

* @param url 页面链接

* @throws java。lang。Exception

*/

private void indexURL(String url) throws Exception

{

boolean flag = true ;

//判断网页链接的级别,系统默认为三级

int level = 1 ;

if (indexedURLs。get(url) == null)

indexedURLs。put(url, new Integer(level));

}

else{

level = ((Integer)indexedURLs。get(url))。intValue();

//只检测到页面的第二级

if(level >; 2 )

return ;

level++ ;

}

String strBody = null ;

try{

//解析页面内容

strBody = loadURL(url);

}catch(Exception e){

return ;

}

if (strBody != null) {

String urlGroups[] = null ;

try{

//解析出页面所以URL

urlGroups = parseURLs(summary);

}catch(Exception e){

logger.info(e。getMessage());

}

if(urlGroups == null)

urlGroups = new String[0] ;

strBody = null ;

for (int i = 0; i < urlGroups.length; i++) enqueueURL(urlGroups[i],level);

}

}

}

}

因篇幅问题不能全部显示,请点此查看更多更全内容

Copyright © 2019- huatuo2.com 版权所有 湘ICP备2023021991号-2

违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com

本站由北京市万商天勤律师事务所王兴未律师提供法律服务