清華大佬耗費三個月吐血整理的幾百G的資源,免費分享!....>>>
package com.zzger.model; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.concurrent.CountDownLatch; import com.zzger.module.queue.UrlQueue; import com.zzger.util.HttpUtils; import com.zzger.util.RegexUtils; public class WebSite { /** * 站點url */ private String url; /** * 需要爬行的url隊列 */ private UrlQueue<String> urls = new UrlQueue<>(); /** * 已爬行過的頁面url */ private List<String> exitUrls = Collections.synchronizedList(new ArrayList<>()); private static final int TOTAL_THREADS = 12; private final CountDownLatch mStartSignal = new CountDownLatch(1); private final CountDownLatch mDoneSignal = new CountDownLatch(TOTAL_THREADS); public WebSite(String url){ this.url = url; urls.offer(url);//把網站首頁加入需要爬行的隊列中 } public void guangDu(){ new Thread(new Runnable() { @Override public void run() { paxing(HttpUtils.httpGet(url)); } }).start(); } public void paxing(String html){ if(html.lastIndexOf("下一頁</a></li></ul></div>")<0) return ; String strList = html.substring(html.indexOf("<li class=\\"next-page\\">"), html.lastIndexOf("下一頁</a></li></ul></div>")); String url = RegexUtils.RegexString("<a href=\\"(.+?)\\"", strList); if(url.equals("Nothing")) return ; urls.put(url);//把url存儲到隊列中 paxing(HttpUtils.httpGet(url)); } public void dxcPx(){ Page<DuanZi> page = new Gxpage(urls.take()); List<Section<DuanZi>> list = page.ybhqSection().getSections(); for(Section<DuanZi> section : list){ new Thread(new Runnable() { @Override public void run() { mStartSignal.countDown();// 計數減一為0,工作線程真正啟動具體操作 try { mStartSignal.await();// 阻塞,等待mStartSignal計數為0運行后面的代碼 // 所有的工作線程都在等待同一個啟動的命令 } catch (InterruptedException e) { e.printStackTrace(); } DuanZi duanzi = section.select().getModel(); System.out.println(duanzi.getTitle()); mDoneSignal.countDown();// 完成以后計數減一 } } ).start(); } try { mDoneSignal.await();// 等待所有工作線程結束 } catch (InterruptedException e) { e.printStackTrace(); } dxcPx();//線程任務執行完后,再次獲取url隊列進行任務 } public static void main(String[] args) { WebSite web = new WebSite("http://duanziwang.com"); web.guangDu(); for(int i = 0; i<10;i++){ new Thread(new Runnable() { @Override public void run() { web.dxcPx(); } }).start(); } } }