基于C#實現網頁爬蟲
本文實例為大家分享了基于C#實現網頁爬蟲的詳細代碼,供大家參考,具體內容如下
HTTP請求工具類:
功能:
1、獲取網頁html
2、下載網絡圖片
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

namespace Utils
{
    /// <summary>
    /// HTTP request helpers: fetch the HTML of a page and download a remote
    /// file into the application's "download" folder.
    /// </summary>
    public class HttpRequestUtil
    {
        /// <summary>User-Agent header sent with every request issued by this class.</summary>
        private const string BrowserUserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

        /// <summary>
        /// Fetches <paramref name="url"/> with an HTTP GET and returns the response body
        /// decoded as UTF-8.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL of the page to fetch.</param>
        /// <returns>The complete response body as text.</returns>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        public static string GetPageHtml(string url)
        {
            // A direct cast fails loudly for non-HTTP schemes instead of yielding a
            // null that would surface later as a NullReferenceException.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = BrowserUserAgent;

            // Nothing is sent until GetResponse() is called. Disposing the response,
            // the stream and the reader returns the connection to the pool even when
            // reading throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> into the "download" folder under
        /// <see cref="Application.StartupPath"/>, using the text after the last '/'
        /// of the URL as the file name. Does nothing when that file already exists.
        /// </summary>
        /// <param name="url">Absolute HTTP(S) URL of the file to download.</param>
        /// <exception cref="ArgumentException">
        /// The URL does not end with a name that is usable as a local file name.
        /// </exception>
        /// <exception cref="WebException">The request failed or the server returned an error status.</exception>
        public static void HttpDownloadFile(string url)
        {
            string fileName = url.Substring(url.LastIndexOf('/') + 1);

            // The name comes from scraped (untrusted) markup: reject anything that is
            // empty or that could escape the download folder or be invalid on disk.
            if (fileName.Length == 0 || fileName.IndexOfAny(Path.GetInvalidFileNameChars()) >= 0)
            {
                throw new ArgumentException("The URL does not end with a valid file name.", "url");
            }

            string directory = Path.Combine(Application.StartupPath, "download");
            Directory.CreateDirectory(directory); // no-op when the folder already exists

            string filePathName = Path.Combine(directory, fileName);
            if (File.Exists(filePathName))
            {
                return;
            }

            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.UserAgent = BrowserUserAgent;
            request.Proxy = null; // skip proxy auto-detection

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            {
                FileStream fileStream = new FileStream(filePathName, FileMode.Create);
                bool completed = false;
                try
                {
                    using (fileStream)
                    {
                        responseStream.CopyTo(fileStream);
                    }
                    completed = true;
                }
                finally
                {
                    // A truncated file would otherwise be treated as "already
                    // downloaded" by the File.Exists check on every later call.
                    if (!completed)
                    {
                        TryDelete(filePathName);
                    }
                }
            }
        }

        /// <summary>Deletes <paramref name="path"/>, ignoring failures (best-effort cleanup).</summary>
        private static void TryDelete(string path)
        {
            try
            {
                File.Delete(path);
            }
            catch (IOException)
            {
                // Still in use; leave it behind rather than mask the original error.
            }
            catch (UnauthorizedAccessException)
            {
                // Same: cleanup is best effort only.
            }
        }
    }
}
多線程爬取網頁代碼:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;

namespace 爬蟲
{
    /// <summary>
    /// Crawler window. button1 starts a crawl over 100 list pages using 10 worker
    /// threads, button2 stops it, button3 pauses it and button4 resumes it.
    /// Progress is shown in lblPage, lblPerson, lblNum and lblSpeed.
    /// </summary>
    /// <remarks>
    /// Pause and stop are cooperative: workers check for them before every HTTP
    /// request, so a request that is already in flight finishes first. This replaces
    /// Thread.Suspend/Resume/Abort, which are obsolete and can freeze or corrupt a
    /// thread at an arbitrary point.
    /// </remarks>
    public partial class Form1 : Form
    {
        private const int ThreadCount = 10;
        private const int PagesPerThread = 10;
        private const int TotalPages = ThreadCount * PagesPerThread;

        // Built once instead of once per crawled page.
        private static readonly Regex ProfileLinkRegex =
            new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
        private static readonly Regex ImageRegex =
            new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

        /// <summary>Worker threads of the most recently started run.</summary>
        private readonly List<Thread> threadList = new List<Thread>();

        /// <summary>Guards <see cref="paused"/> and is pulsed whenever workers must re-check it.</summary>
        private readonly object pauseLock = new object();
        private bool paused;

        /// <summary>State of the run started by the last button1 click; only touched on the UI thread.</summary>
        private CrawlRun currentRun;

        /// <summary>Counters and cancel flag shared by the workers of one run.</summary>
        private sealed class CrawlRun
        {
            public readonly DateTime StartTime = DateTime.Now;
            public volatile bool Cancelled;
            public int PageCount;
            public int PersonCount;
            public int ImageCount;
        }

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Starts a new crawl with <see cref="ThreadCount"/> worker threads.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            // Make sure nothing from an earlier run keeps working or stays paused.
            StopCurrentRun();

            CrawlRun run = new CrawlRun();
            currentRun = run;

            button3.Enabled = true;
            button2.Enabled = true;
            button1.Enabled = false;
            lblPage.Text = "已完成頁(yè)數(shù):0";

            for (int i = 1; i <= ThreadCount; i++)
            {
                int threadNumber = i; // per-thread copy; the loop variable is shared
                Thread worker = new Thread(() => CrawlPages(run, threadNumber));
                // Background threads cannot keep the process alive after the form closes.
                worker.IsBackground = true;
                threadList.Add(worker);
                worker.Start();
            }
        }

        /// <summary>Stops the current crawl and resets the buttons.</summary>
        private void button2_Click(object sender, EventArgs e)
        {
            StopCurrentRun();
            button1.Enabled = true;
            button2.Enabled = false;
            button3.Enabled = false;
            button4.Enabled = false;
        }

        private void Form1_FormClosing(object sender, FormClosingEventArgs e)
        {
            StopCurrentRun();
        }

        /// <summary>Pauses the crawl; workers block before their next request.</summary>
        private void button3_Click(object sender, EventArgs e)
        {
            lock (pauseLock)
            {
                paused = true;
            }
            button3.Enabled = false;
            button4.Enabled = true;
        }

        /// <summary>Resumes a paused crawl.</summary>
        private void button4_Click(object sender, EventArgs e)
        {
            ReleasePausedWorkers();
            button3.Enabled = true;
            button4.Enabled = false;
        }

        /// <summary>Cancels the current run (if any) and wakes paused workers so they can exit.</summary>
        private void StopCurrentRun()
        {
            if (currentRun != null)
            {
                currentRun.Cancelled = true;
            }
            ReleasePausedWorkers();
            threadList.Clear();
        }

        /// <summary>Clears the pause flag and wakes every worker waiting on it.</summary>
        private void ReleasePausedWorkers()
        {
            lock (pauseLock)
            {
                paused = false;
                Monitor.PulseAll(pauseLock);
            }
        }

        /// <summary>
        /// Blocks while the crawl is paused.
        /// </summary>
        /// <returns>false when <paramref name="run"/> has been cancelled and the caller should exit.</returns>
        private bool WaitWhilePaused(CrawlRun run)
        {
            lock (pauseLock)
            {
                while (paused && !run.Cancelled)
                {
                    Monitor.Wait(pauseLock);
                }
            }
            return !run.Cancelled;
        }

        /// <summary>
        /// Runs <paramref name="action"/> on the UI thread unless the run was cancelled
        /// or the form is gone.
        /// </summary>
        private void UpdateUi(CrawlRun run, Action action)
        {
            if (run.Cancelled || IsDisposed || !IsHandleCreated)
            {
                return;
            }

            try
            {
                Invoke(new Action(delegate()
                {
                    // Re-check on the UI thread, where Cancelled is set, so a stopped
                    // run can never overwrite the labels of a newer one.
                    if (!run.Cancelled)
                    {
                        action();
                    }
                }));
            }
            catch (InvalidOperationException)
            {
                // The form was closed while the call was pending (this also covers
                // ObjectDisposedException); nothing is left to update.
            }
        }

        /// <summary>
        /// Worker body: crawls the <see cref="PagesPerThread"/> list pages assigned to
        /// worker <paramref name="threadNumber"/> (1-based).
        /// </summary>
        private void CrawlPages(CrawlRun run, int threadNumber)
        {
            for (int j = 1; j <= PagesPerThread; j++)
            {
                if (!WaitWhilePaused(run))
                {
                    return;
                }

                // Local to this thread: a variable shared between workers would let
                // one thread overwrite another's page number before it is used.
                int pageIndex = (threadNumber - 1) * PagesPerThread + j;

                try
                {
                    CrawlListPage(run, pageIndex);
                }
                catch (Exception ex)
                {
                    // Best effort: a page that fails is skipped but still counted.
                    System.Diagnostics.Debug.WriteLine(ex);
                }

                if (run.Cancelled)
                {
                    return;
                }

                // Atomic, so exactly one worker observes the final page count.
                int pagesDone = Interlocked.Increment(ref run.PageCount);
                UpdateUi(run, delegate()
                {
                    lblPage.Text = "已完成頁(yè)數(shù):" + pagesDone.ToString();
                });

                if (pagesDone == TotalPages)
                {
                    UpdateUi(run, delegate()
                    {
                        button1.Enabled = true;
                        MessageBox.Show("完成!");
                    });
                }
            }
        }

        /// <summary>
        /// Fetches one list page, follows every site-relative profile link on it and
        /// downloads the images found on each profile page.
        /// </summary>
        private void CrawlListPage(CrawlRun run, int pageIndex)
        {
            string pageHtml = HttpRequestUtil.GetPageHtml(
                "http://tt.mop.com/c44/0/1_" + pageIndex.ToString() + ".html");

            foreach (Match linkMatch in ProfileLinkRegex.Matches(pageHtml))
            {
                // Group 2 is the href value captured by the pattern.
                string url = linkMatch.Groups[2].Value;
                if (!url.StartsWith("/", StringComparison.Ordinal))
                {
                    continue;
                }

                if (!WaitWhilePaused(run))
                {
                    return;
                }

                string imgPageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com" + url);

                int personsDone = Interlocked.Increment(ref run.PersonCount);
                UpdateUi(run, delegate()
                {
                    lblPerson.Text = "已完成條數(shù):" + personsDone.ToString();
                });

                foreach (Match imgMatch in ImageRegex.Matches(imgPageHtml))
                {
                    // Group 1 is the src value captured by the pattern.
                    string imgUrl = imgMatch.Groups[1].Value;
                    if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    if (!WaitWhilePaused(run))
                    {
                        return;
                    }

                    DownloadImage(run, imgUrl);
                    Thread.Sleep(1);
                }
            }
        }

        /// <summary>Downloads one image and refreshes the image counter and speed labels.</summary>
        private void DownloadImage(CrawlRun run, string imgUrl)
        {
            try
            {
                HttpRequestUtil.HttpDownloadFile(imgUrl);

                int imagesDone = Interlocked.Increment(ref run.ImageCount);
                UpdateUi(run, delegate()
                {
                    lblNum.Text = "已下載圖片數(shù)" + imagesDone.ToString();
                    double seconds = DateTime.Now.Subtract(run.StartTime).TotalSeconds;
                    if (seconds > 0)
                    {
                        lblSpeed.Text = "速度:" + (imagesDone / seconds).ToString("0.0") + "張/秒";
                    }
                });
            }
            catch (Exception ex)
            {
                // Best effort: one failed image must not end the crawl of this page.
                System.Diagnostics.Debug.WriteLine(ex);
            }
        }
    }
}
截圖:
以上就是本文的全部內容,希望對大家的學習有所幫助。
相關文章
基于C#實現一個溫濕度監測小工具
這篇文章主要為大家詳細介紹了如何基于C#實現一個溫濕度監測小工具,文中的示例代碼講解詳細,具有一定的借鑒價值,需要的可以參考一下 2023-01-01
C#實現的簡單整數四則運算計算器功能示例
這篇文章主要介紹了C#實現的簡單整數四則運算計算器功能,涉及C#界面布局、事件響應及數值運算等相關操作技巧,需要的朋友可以參考下 2017-09-09
Winform ComboBox如何獨立繪制下拉選項的字體顏色
這篇文章主要介紹了Winform ComboBox如何獨立繪制下拉選項的字體顏色,幫助大家更好的理解和使用c# winform,感興趣的朋友可以了解下 2020-11-11