Java實現登錄之後抓取數據
最近做了一個從網絡上抓取數據的小程序,主要是關於信貸方面:收集了一些黑名單網站,把這些網站上的數據抓取到自己的系統中。
也找了一些資料,覺得沒有一個很好的、全面的例子,因此在這裡做個筆記提醒自己。
首先需要一個jsoup的jar包,我用的是1.6.0。下載地址為:http://pan.baidu.com/s/1mgqOuHa
1,獲取網頁內容(核心代碼,技術有限沒封裝)。
2,登錄之後抓取網頁數據(如何在請求中攜帶cookie)。
3,獲取網站的ajax請求方法(返回json)。
以上這三點我用一個類全部包含(比較粗糙,望見諒;直接copy代碼過去,應該就可以用)。
一,這個類分別包含上面1、2、3三種方法,直接運行main方法就可以進行測試。
package com.minxinloan.black.web.utils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demo utility showing three ways of pulling data from a web site:
 * <ol>
 *   <li>log in with jsoup and re-use the returned cookies on a second request,</li>
 *   <li>fetch a page that needs no login,</li>
 *   <li>read a URL with {@link HttpURLConnection} and parse the body as a JSON array.</li>
 * </ol>
 * It also offers two helpers that convert json-lib structures into plain
 * {@code Map}/{@code List} values.
 */
public class CookieUtil {

    /** Name of the HTTP {@code Content-Type} header. */
    public final static String CONTENT_TYPE = "Content-Type";

    /**
     * Runs the three demos one after the other against the hard-coded site.
     *
     * @param args optional overrides: {@code args[0]} is the user name and
     *             {@code args[1]} the password; when absent the built-in demo
     *             account is used
     */
    public static void main(String[] args) {
        String listURL = "http://www.p2peye.com/blacklist.php?p=2";
        String logURL = "http://www.p2peye.com/member.php";
        // NOTE(review): the fallback credentials are checked into source and are
        // sent over plain HTTP; pass real ones on the command line instead.
        String username = (args != null && args.length > 0) ? args[0] : "puqiuxiaomao";
        String password = (args != null && args.length > 1) ? args[1] : "a1234567";
        try {
            // 1) page that requires a logged-in session
            fetchListAfterLogin(logURL, listURL, username, password);
            // 2) page that does not require a login
            fetchPublicPage(listURL);
            // 3) "ajax" style request whose body is expected to be a JSON array
            fetchJsonList(listURL);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Posts the login form, copies every cookie of the login response onto a
     * new request for {@code listUrl} and fetches that page.
     */
    private static Document fetchListAfterLogin(String loginUrl, String listUrl,
            String username, String password) throws IOException {
        Connection.Response loginResponse = Jsoup.connect(loginUrl)
                .data("mod", "logging",
                      "action", "login",
                      "loginsubmit", "yes",
                      "loginhash", "Lsc66",
                      "username", username,
                      "password", password)
                .method(Method.POST)
                .execute();

        Connection con = Jsoup.connect(listUrl);
        // The User-Agent decides whether the site serves its desktop or mobile view.
        con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");
        // Which cookie carries the session depends on the target site, so all
        // cookies returned by the login are forwarded.
        for (Entry<String, String> cookie : loginResponse.cookies().entrySet()) {
            con.cookie(cookie.getKey(), cookie.getValue());
        }

        Document doc = con.get();
        readElements(doc.getAllElements());
        return doc;
    }

    /** Fetches a page that needs no login and walks over all of its elements. */
    private static Document fetchPublicPage(String url) throws IOException {
        Document doc = Jsoup.connect(url).get();
        readElements(doc.getAllElements());
        return doc;
    }

    /**
     * Shows how to read data from each element. The values are discarded in
     * this demo. {@code getAllElements()} already yields every descendant, so
     * a single pass is enough.
     */
    private static void readElements(Elements elements) {
        for (Element element : elements) {
            element.text();        // text content of the tag
            element.attr("href");  // attribute value, e.g. the target of an <a> tag
        }
    }

    /**
     * Reads {@code url} as UTF-8 text and parses it as a JSON array.
     * Best-effort: any failure is reported on stderr and yields an empty list.
     */
    private static List<Map<String, Object>> fetchJsonList(String url) {
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            connection = (HttpURLConnection) new URL(url).openConnection();
            reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "utf-8"));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                sb.append(line);
            }
            return parseJSON2List(sb.toString());
        } catch (Exception e) {
            e.printStackTrace();
            return new ArrayList<Map<String, Object>>();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // nothing useful can be done if closing fails
                }
            }
            // connection stays null when openConnection() itself failed
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Converts a JSON object string into a map.
     * <p>
     * Values that are JSON arrays become lists: object elements are converted
     * recursively into maps, any other element is kept as returned by json-lib.
     * All other values are stored unchanged.
     *
     * @param jsonStr text of a JSON object
     * @return map keyed by the object's property names
     */
    public static Map<String, Object> parseJSON2Map(String jsonStr) {
        Map<String, Object> map = new HashMap<String, Object>();
        JSONObject json = JSONObject.fromObject(jsonStr);
        for (Object k : json.keySet()) {
            Object v = json.get(k);
            if (v instanceof JSONArray) {
                List<Object> list = new ArrayList<Object>();
                for (Object item : (JSONArray) v) {
                    if (item instanceof JSONObject) {
                        list.add(parseJSON2Map(item.toString()));
                    } else {
                        // scalars and nested arrays cannot be turned into a map
                        list.add(item);
                    }
                }
                map.put(k.toString(), list);
            } else {
                map.put(k.toString(), v);
            }
        }
        return map;
    }

    /**
     * Converts a JSON array string whose elements are JSON objects into a list
     * of maps (see {@link #parseJSON2Map(String)}).
     *
     * @param jsonStr text of a JSON array of objects
     * @return one map per array element, in array order
     * @throws ClassCastException if an element is not a JSON object
     */
    public static List<Map<String, Object>> parseJSON2List(String jsonStr) {
        JSONArray jsonArr = JSONArray.fromObject(jsonStr);
        List<Map<String, Object>> list = new ArrayList<Map<String, Object>>();
        for (Object item : jsonArr) {
            if (!(item instanceof JSONObject)) {
                throw new ClassCastException(
                        "JSON array element is not an object: " + item);
            }
            list.add(parseJSON2Map(item.toString()));
        }
        return list;
    }
}
二,這個是獲取驗證碼的類,可以研究下(但你需要先分析出網站驗證碼的請求地址)。
package com.minxinloan.black.web.utils;
import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
/**
 * HTTP helpers: download a captcha image to disk, extract the charset from a
 * {@code Content-Type} value, write text to a file and read a URL as text.
 */
public class Utils {

    /**
     * Requests {@code sUrl} and, on HTTP 200/201, optionally stores the
     * response body as {@code path + "/code.bmp"}.
     * <p>
     * The body is written to disk only, so {@link Content#getBody()} of the
     * result is always the empty string; the response headers are returned in
     * {@link Content#getHeaders()}.
     *
     * @param method             "GET" or "POST" in any letter case; anything
     *                           else, including {@code null}, is sent as POST
     * @param sUrl               URL to request
     * @param paramMap           NOTE(review): currently unused, kept for
     *                           interface compatibility
     * @param requestHeaderMap   NOTE(review): currently unused, kept for
     *                           interface compatibility
     * @param isOnlyReturnHeader when {@code true} the body is not downloaded
     * @param path               directory that receives {@code code.bmp}
     * @return the response description, or {@code null} when the request
     *         failed or the status was neither 200 nor 201
     */
    public static Content getRandom(String method, String sUrl,
            Map<String, String> paramMap,
            Map<String, String> requestHeaderMap,
            boolean isOnlyReturnHeader, String path) {
        Content content = null;
        HttpURLConnection httpUrlConnection = null;
        try {
            URL url = new URL(sUrl);
            // setRequestMethod only accepts upper-case verbs.
            String verb = "GET".equalsIgnoreCase(method) ? "GET" : "POST";

            httpUrlConnection = (HttpURLConnection) url.openConnection();
            httpUrlConnection.setRequestMethod(verb);
            httpUrlConnection.setRequestProperty("Accept-Language",
                    "zh-cn,zh;q=0.5");
            // Redirects are not followed; a 3xx answer therefore yields null.
            httpUrlConnection.setInstanceFollowRedirects(false);
            httpUrlConnection.setDoOutput(true);
            httpUrlConnection.setDoInput(true);
            httpUrlConnection.setConnectTimeout(5000);
            httpUrlConnection.setReadTimeout(5000);
            httpUrlConnection.setUseCaches(false);
            // NOTE(review): this changes the default for later connections too.
            httpUrlConnection.setDefaultUseCaches(false);
            httpUrlConnection.connect();

            int responseCode = httpUrlConnection.getResponseCode();
            if (responseCode == HttpURLConnection.HTTP_OK
                    || responseCode == HttpURLConnection.HTTP_CREATED) {
                if (!isOnlyReturnHeader) {
                    saveBody(httpUrlConnection, path + "/code.bmp");
                }
                byte[] bytes = new byte[0];
                String encoding = getEncodingFromContentType(
                        httpUrlConnection.getHeaderField("Content-Type"));
                if (encoding == null) {
                    // no Content-Type header at all
                    encoding = "UTF-8";
                }
                content = new Content(sUrl, new String(bytes, encoding),
                        httpUrlConnection.getHeaderFields());
            }
        } catch (Exception e) {
            // Callers rely on null for "failed"; report instead of hiding it.
            e.printStackTrace();
            return null;
        } finally {
            if (httpUrlConnection != null) {
                httpUrlConnection.disconnect();
            }
        }
        return content;
    }

    /** Copies the response body of {@code connection} into {@code target}. */
    private static void saveBody(HttpURLConnection connection, String target)
            throws java.io.IOException {
        InputStream in = null;
        FileOutputStream out = null;
        try {
            in = connection.getInputStream();
            out = new FileOutputStream(target);
            byte[] buffer = new byte[4096];
            int count;
            while ((count = in.read(buffer)) != -1) {
                out.write(buffer, 0, count);
            }
        } finally {
            closeQuietly(out);
            closeQuietly(in);
        }
    }

    /** Closes {@code c} if it is non-null, ignoring a failure to close. */
    private static void closeQuietly(java.io.Closeable c) {
        if (c == null) {
            return;
        }
        try {
            c.close();
        } catch (java.io.IOException ignored) {
            // nothing useful can be done if closing fails
        }
    }

    /**
     * Extracts the {@code charset} parameter from a {@code Content-Type} value.
     * <p>
     * The first {@code ;}-separated token (the media type) is skipped. When
     * several supported charsets are given, the last one wins. Surrounding
     * double quotes are removed.
     *
     * @param contentType header value, e.g. {@code text/html; charset=utf-8}
     * @return the charset name as written in the header; {@code "UTF-8"} when
     *         no supported charset is present; {@code null} when
     *         {@code contentType} is {@code null}
     */
    public static String getEncodingFromContentType(String contentType) {
        if (contentType == null) {
            return null;
        }
        String encoding = null;
        StringTokenizer tok = new StringTokenizer(contentType, ";");
        if (tok.hasMoreTokens()) {
            tok.nextToken(); // media type, not a parameter
            while (tok.hasMoreTokens()) {
                String assignment = tok.nextToken().trim();
                int eqIdx = assignment.indexOf('=');
                if (eqIdx == -1) {
                    continue;
                }
                String varName = assignment.substring(0, eqIdx).trim();
                if (!"charset".equalsIgnoreCase(varName)) {
                    continue;
                }
                String varValue = assignment.substring(eqIdx + 1).trim();
                // length check: a lone quote both starts and ends with '"'
                if (varValue.length() >= 2 && varValue.startsWith("\"")
                        && varValue.endsWith("\"")) {
                    varValue = varValue.substring(1, varValue.length() - 1);
                }
                if (isSupportedCharset(varValue)) {
                    encoding = varValue;
                }
            }
        }
        return encoding == null ? "UTF-8" : encoding;
    }

    /**
     * Like {@link Charset#isSupported(String)} but returns {@code false} for
     * empty or syntactically illegal names instead of throwing.
     */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // IllegalCharsetNameException is an IllegalArgumentException
            return false;
        }
    }

    /**
     * Writes {@code content} to the file at {@code path}, replacing any
     * previous content. Uses the platform default charset.
     *
     * @return {@code true} when the text was written, {@code false} on any
     *         failure (the cause is printed to stderr)
     */
    public static boolean inFile(String content, String path) {
        PrintWriter out = null;
        try {
            // FileWriter creates the file when it does not exist yet.
            out = new PrintWriter(new FileWriter(new File(path)));
            out.write(content);
            // PrintWriter never throws on write; checkError() flushes and
            // reports whether anything went wrong.
            return !out.checkError();
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // out stays null when the file could not be opened
            if (out != null) {
                out.close();
            }
        }
    }

    /**
     * Downloads {@code httpurl}, decoding it as UTF-8, and returns the text
     * with every line terminated by {@code "\n"}. The response code is
     * printed to stdout.
     *
     * @return the page text, or the empty string on any failure (the cause is
     *         printed to stderr)
     */
    public static String getHtmlReadLine(String httpurl) {
        HttpURLConnection connection = null;
        BufferedReader reader = null;
        try {
            URL url = new URL(httpurl);
            connection = (HttpURLConnection) url.openConnection();
            connection.connect();
            System.out.println(connection.getResponseCode());
            reader = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(), "utf-8"));
            StringBuilder total = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                total.append(line).append("\n");
            }
            return total.toString();
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        } finally {
            closeQuietly(reader);
            if (connection != null) {
                connection.disconnect();
            }
        }
    }
}

/** Immutable holder for the URL, body text and headers of an HTTP response. */
class Content {
    private final String url;
    private final String body;
    private final Map<String, List<String>> headers;

    /**
     * @param url     URL the response came from
     * @param body    response body as text
     * @param headers response headers; stored as given, not copied
     */
    public Content(String url, String body, Map<String, List<String>> headers) {
        this.url = url;
        this.body = body;
        this.headers = headers;
    }

    /** @return URL the response came from */
    public String getUrl() {
        return url;
    }

    /** @return response body as text */
    public String getBody() {
        return body;
    }

    /** @return the header map passed to the constructor */
    public Map<String, List<String>> getHeaders() {
        return headers;
    }
}
相關文章
java增強for循環的實現方法
下面小編就為大家帶來一篇java增強for循環的實現方法。小編覺得挺不錯的,現在就分享給大家,也給大家做個參考。一起跟隨小編過來看看吧 2016-09-09
Kotlin 基礎教程之數組容器
這篇文章主要介紹了Kotlin 基礎教程之數組容器的相關資料,需要的朋友可以參考下 2017-06-06
Springboot多數據源配置之整合dynamic-datasource方式
這篇文章主要介紹了Springboot多數據源配置之整合dynamic-datasource方式,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教 2023-03-03
hibernate通過session實現增刪改查操作實例解析
這篇文章主要介紹了hibernate通過session實現增刪改查操作實例解析,具有一定借鑒價值,需要的朋友可以參考下。2017-12-12

