java代理實(shí)現(xiàn)爬取代理IP的示例
僅僅使用了一個(gè)java文件,運(yùn)行main方法即可,需要依賴的jar包是com.alibaba.fastjson(版本1.2.28)和Jsoup(版本1.10.2)
如果用了pom,那么就是以下兩個(gè):
<dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.28</version> </dependency> <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
完整的代碼如下:
package com.tuniu.fcm.facade.IPProxy; import com.alibaba.fastjson.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 獲取代理IP,需要 * com.alibaba.fastjson.JSONObject以及Jsoup */ public class ProxyCralwerUnusedVPN { ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>(); ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>(); public static void main(String[] args) { ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN(); /** * 想要獲取的代理IP個(gè)數(shù),由需求方自行指定。(如果個(gè)數(shù)太多,將導(dǎo)致返回變慢) */ proxyCrawler.startCrawler(1); } /** * 暴露給外部模塊調(diào)用的入口 * @param wantedNumber 調(diào)用方期望獲取到的代理IP個(gè)數(shù) */ public String startCrawler(int wantedNumber) { localWantedNumber.set(wantedNumber); kuaidailiCom("http://www.xicidaili.com/nn/", 15); kuaidailiCom("http://www.xicidaili.com/nt/", 15); kuaidailiCom("http://www.xicidaili.com/wt/", 15); kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15); kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15); kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15); /** * 構(gòu)造返回?cái)?shù)據(jù) */ ProxyResponse response = new ProxyResponse(); response.setSuccess("true"); Map<String, Object> dataInfoMap = new HashMap<String, Object>(); dataInfoMap.put("numFound", localProxyInfos.get().size()); dataInfoMap.put("pageNum", 1); dataInfoMap.put("proxy", localProxyInfos.get()); response.setData(dataInfoMap); String responseString = JSONObject.toJSON(response).toString(); System.out.println(responseString); return responseString; } private void kuaidailiCom(String baseUrl, int totalPage) { String ipReg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; Pattern ipPtn = Pattern.compile(ipReg); for (int i = 1; i < totalPage; i++) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { return; } try { Document doc = Jsoup.connect(baseUrl + i + "/") .header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("Accept-Encoding", "gzip, deflate, sdch") .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6") .header("Cache-Control", "max-age=0") .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36") .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244") .header("Host", "www.kuaidaili.com") .header("Referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get(); Matcher m = ipPtn.matcher(doc.text()); while (m.find()) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { break; } String[] strs = m.group().split(" "); if (checkProxy(strs[0], Integer.parseInt(strs[1]))) { System.out.println("獲取到可用代理IP\t" + strs[0] + "\t" + strs[1]); addProxy(strs[0], strs[1], "http"); } } } catch (Exception e) { e.printStackTrace(); } } } private static boolean checkProxy(String ip, Integer port) { try { //http://1212.ip138.com/ic.asp 可以換成任何比較快的網(wǎng)頁 Jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get(); return true; } catch (Exception e) { return false; } } private int getCurrentProxyNumber() { List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); localProxyInfos.set(proxyInfos); return 0; } else { return proxyInfos.size(); } } private void addProxy(String ip, String port, String protocol){ List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); proxyInfos.add(new ProxyInfo(ip, port, protocol)); } else { proxyInfos.add(new ProxyInfo(ip, port, protocol)); } } } class ProxyInfo { private String userName = ""; private String ip; private String password = ""; private String type; private String port; private int is_internet = 1; public ProxyInfo(String ip, String port, String type) { this.ip = ip; this.type = type; this.port = port; } public String getUserName() { return userName; } public void setUserName(String userName) { this.userName = userName; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getPassword() { return password; } public void setPassword(String password) { this.password = password; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getPort() { return port; } public void setPort(String port) { this.port = port; } public int getIs_internet() { return is_internet; } public void setIs_internet(int is_internet) { this.is_internet = is_internet; } } class ProxyResponse { private String success; private Map<String, Object> data; public String getSuccess() { return success; } public void setSuccess(String success) { this.success = success; } public Map<String, Object> getData() { return data; } public void setData(Map<String, Object> data) { this.data = data; } }
以上這篇java代理實(shí)現(xiàn)爬取代理IP的示例就是小編分享給大家的全部內(nèi)容了,希望能給大家一個(gè)參考,也希望大家多多支持腳本之家。
相關(guān)文章
Java中IO流簡介_動(dòng)力節(jié)點(diǎn)Java學(xué)院整理
Java io系統(tǒng)的設(shè)計(jì)初衷,就是為了實(shí)現(xiàn)“文件、控制臺、網(wǎng)絡(luò)設(shè)備”這些io設(shè)置的通信。接下來通過本文給大家介紹Java中IO流簡介,感興趣的朋友一起看看吧2017-05-05Springboot集成MongoDB無認(rèn)證與開啟認(rèn)證的配置方式
本文主要介紹了Springboot集成MongoDB無認(rèn)證與開啟認(rèn)證的配置方式,文中通過示例代碼介紹的非常詳細(xì),對大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧2024-03-03Java實(shí)現(xiàn)文件分片上傳接口的示例代碼
這篇文章主要為大家詳細(xì)介紹了如何利用Java語言實(shí)現(xiàn)文件分片上傳的功能,文中的示例代碼講解詳細(xì),感興趣的小伙伴可以跟隨小編一起了解一下2022-07-07SpringCloud的@RefreshScope 注解你了解嗎
這篇文章主要介紹了Spring Cloud @RefreshScope 原理及使用,文中通過示例代碼介紹的非常詳細(xì),對大家的學(xué)習(xí)或者工作具有一定的參考學(xué)習(xí)價(jià)值,需要的朋友們下面隨著小編來一起學(xué)習(xí)學(xué)習(xí)吧2021-09-09Springboot使用Junit測試沒有插入數(shù)據(jù)的原因
這篇文章主要介紹了Springboot使用Junit測試沒有插入數(shù)據(jù)的原因,具有很好的參考價(jià)值,希望對大家有所幫助。一起跟隨小編過來看看吧2020-04-04