asp.net(c#)做一個網頁數據采集工具
更新時間:2009年12月16日 22:56:40 作者:
最近做一個網站,該網站需要添加4000多條產品信息,如果用人工方法去別的網站copy,那至少要花費半個月時間才能完成,所以我想了個辦法,使用c#做出了一個網頁數據采集軟件。
通過這個軟件,一兩天就完成了幾千條產品數據的錄入。可見很多工作不必一味用人工去做;作為一個程序員,就是要讓那些經常做重復性的、繁瑣的工作的人解放出來。下面只寫了一些核心代碼,而且采集邏輯必須和對應網站相掛鉤。作者:鄭少群
/// <summary>
/// Downloads the product list page whose address is entered in textBox1, extracts every
/// href link from it and writes the resulting absolute URLs to textBox5, one per line.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string listUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();
    if (listUrl == "" || domain == "")
    {
        MessageBox.Show("網(wǎng)址和域名不能為空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }
    try
    {
        // Fetch the page the user entered. The address used to be hard-coded, which made
        // the textBox1 validation above meaningless.
        string Html = inc.GetHtml(listUrl);
        // Each match is the whole attribute, e.g. href="/product/1.html".
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");
        StringBuilder sb = new StringBuilder();
        foreach (object var in al)
        {
            string a = var.ToString().Replace("\"", "").Replace("'", "");
            // The pattern above tolerates whitespace around '=', so remove the attribute
            // name together with that whitespace, and only at the start of the match.
            a = Regex.Replace(a, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();
            // Site-relative link: prefix it with the domain from textBox2.
            if (a.StartsWith("/"))
            {
                a = domain + a;
            }
            // Add a scheme only when none is present; https links are left untouched.
            if (!a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !a.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                a = "http://" + a;
            }
            sb.Append(a).Append("\r\n");
        }
        // Show the extracted addresses in textBox5, one link per line.
        textBox5.Text = sb.ToString();
        MessageBox.Show("共提取" + al.Count.ToString() + "個鏈接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出錯!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}
/// <summary>
/// Background-worker job: scrapes every product page listed in textBox5, extracts the
/// product name, model id and description from the HTML, downloads the product image into
/// the local Images folder and saves the rows to the Tb_Product table.
/// </summary>
/// <remarks>
/// The marker strings and offsets used for extraction are tied to the HTML of the scraped
/// site and must be adapted for any other site.
/// NOTE(review): this handler reads textBox2/textBox5 and writes toolStripStatusLabel1,
/// and calls DataBind/ShowError. BackgroundWorker raises DoWork off the UI thread, so
/// consider passing the inputs through e.Argument and moving UI updates to the
/// ProgressChanged / RunWorkerCompleted handlers.
/// </remarks>
/// <param name="sender">The BackgroundWorker running this job.</param>
/// <param name="e">Event data; Cancel is set when the job stops on a cancellation request.</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Empty the product table; the adapter below then loads its schema for the new rows.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();
    BackgroundWorker worker = (BackgroundWorker)sender;
    // URLs keep their original case: paths and query strings can be case-sensitive.
    string[] Urls = textBox5.Text.Trim().Replace("\r\n", ",").Split(',');
    string domain = textBox2.Text.Trim();
    StringBuilder ErrorStr = new StringBuilder();
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";
    // Make sure the download target exists so image saving cannot fail for that reason.
    System.IO.Directory.CreateDirectory(ImageDir);

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder is not referenced again, but it must exist: it supplies the
    // INSERT command that da.Update uses.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Process one product page per iteration.
        for (int i = 0; i < Urls.Length; i++)
        {
            if (worker.CancellationPending)
            {
                // Stop scraping, but still save the rows collected so far (below).
                e.Cancel = true;
                break;
            }

            string url = Urls[i].Trim();
            if (url.Length == 0)
            {
                // Skip blank lines. Returning here would discard every row already scraped.
                continue;
            }

            try
            {
                string html = inc.GetHtml(url);
                DataRow NewRow = dt2.NewRow();

                // Product name: the text between <title> and </title>.
                int titleStart = html.IndexOf("<title>", StringComparison.Ordinal);
                int titleEnd = titleStart < 0 ? -1 : html.IndexOf("</title>", titleStart + 7, StringComparison.Ordinal);
                if (titleEnd < 0)
                {
                    throw new InvalidOperationException("<title> element not found");
                }
                string productName = html.Substring(titleStart + 7, titleEnd - titleStart - 7).Trim();
                NewRow["ProductName"] = productName;

                // Model id: everything after "Model:" in the product name.
                int modelAt = productName.IndexOf("Model:", StringComparison.Ordinal);
                if (modelAt < 0)
                {
                    throw new InvalidOperationException("\"Model:\" marker not found in the page title");
                }
                NewRow["ModelId"] = productName.Substring(modelAt + 6).Trim();

                // Description: starts 26 characters after the start of "Product Details"
                // (the 15-character marker plus 11 more, presumably site-specific markup)
                // and runs through the first closing </table>.
                int detailsAt = html.IndexOf("Product Details", StringComparison.Ordinal);
                if (detailsAt < 0 || detailsAt + 26 > html.Length)
                {
                    throw new InvalidOperationException("\"Product Details\" section not found");
                }
                string Introduce = html.Substring(detailsAt + 26);
                int tableEnd = Introduce.IndexOf("</table>", StringComparison.Ordinal);
                if (tableEnd < 0)
                {
                    throw new InvalidOperationException("closing </table> of the product details not found");
                }
                NewRow["Introduce"] = Introduce.Remove(tableEnd + 8).Trim();

                // Image: the src attribute of the first <img> following "align=center>".
                // A missing or failing image is logged but does not drop the product row.
                string ProductImage = null;
                int imgAt = html.IndexOf("align=center><img", StringComparison.Ordinal);
                if (imgAt >= 0)
                {
                    string rest = html.Substring(imgAt + 17);
                    int srcAt = rest.IndexOf("src=\"", StringComparison.Ordinal);
                    if (srcAt >= 0)
                    {
                        rest = rest.Substring(srcAt + 5);
                        int quoteAt = rest.IndexOf('"');
                        if (quoteAt >= 0)
                        {
                            ProductImage = domain + rest.Remove(quoteAt);
                        }
                    }
                }
                if (ProductImage == null)
                {
                    ErrorStr.Append("Product image not found; URL: " + url + "\r\n");
                }
                else
                {
                    string localPath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
                    try
                    {
                        inc.DownFile(ProductImage, localPath);
                    }
                    catch (Exception)
                    {
                        ErrorStr.Append("下載圖片失敗,圖片地址:" + localPath + "\r\n");
                    }
                }

                dt2.Rows.Add(NewRow);
                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                // Progress text shown in the status bar.
                toolStripStatusLabel1.Text = "處理進度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                // One bad page must not abort the whole run: record it and move on.
                ErrorStr.Append("采集錯誤:" + err.Message + ";網(wǎng)址:" + url + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <returns>
/// The response body, decoded with the character set reported by the response, or with
/// UTF-8 when that character set is missing or not recognised.
/// </returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);
    // Dispose the response and reader even when reading fails, so connections are not leaked.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        Encoding encoding;
        try
        {
            encoding = Encoding.GetEncoding(response.CharacterSet);
        }
        catch (ArgumentException)
        {
            // Empty, null or unknown charset name: fall back instead of failing the download.
            encoding = Encoding.UTF8;
        }

        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encoding))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> matched by
/// <paramref name="strRegex"/>, sorted with the default comparer.
/// </summary>
/// <param name="htmlCode">Text to search, typically the HTML of a page.</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>An ArrayList of strings holding the full text of each unique match.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();
    // Tracks values already added so duplicates are rejected with a hash lookup
    // instead of rescanning the whole result list for every match.
    Hashtable seen = new Hashtable();
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        string value = m.Value;
        if (!seen.ContainsKey(value))
        {
            seen.Add(value, null);
            al.Add(value);
        }
    }
    al.Sort();
    return al;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and saves it to the file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute HTTP address of the file to download.</param>
/// <param name="Path">Local file path to write to.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file. OpenOrCreate would leave the tail of a
    // longer old file behind the new content and corrupt the download.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] b = new byte[4096];
        int n;
        while ((n = stream.Read(b, 0, b.Length)) > 0)
        {
            fs.Write(b, 0, n);
        }
    }
}
復制代碼 代碼如下:
/// <summary>
/// Downloads the product list page whose address is entered in textBox1, extracts every
/// href link from it and writes the resulting absolute URLs to textBox5, one per line.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string listUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();
    if (listUrl == "" || domain == "")
    {
        MessageBox.Show("網(wǎng)址和域名不能為空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }
    try
    {
        // Fetch the page the user entered. The address used to be hard-coded, which made
        // the textBox1 validation above meaningless.
        string Html = inc.GetHtml(listUrl);
        // Each match is the whole attribute, e.g. href="/product/1.html".
        ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");
        StringBuilder sb = new StringBuilder();
        foreach (object var in al)
        {
            string a = var.ToString().Replace("\"", "").Replace("'", "");
            // The pattern above tolerates whitespace around '=', so remove the attribute
            // name together with that whitespace, and only at the start of the match.
            a = Regex.Replace(a, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();
            // Site-relative link: prefix it with the domain from textBox2.
            if (a.StartsWith("/"))
            {
                a = domain + a;
            }
            // Add a scheme only when none is present; https links are left untouched.
            if (!a.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                && !a.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
            {
                a = "http://" + a;
            }
            sb.Append(a).Append("\r\n");
        }
        // Show the extracted addresses in textBox5, one link per line.
        textBox5.Text = sb.ToString();
        MessageBox.Show("共提取" + al.Count.ToString() + "個鏈接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出錯!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}
/// <summary>
/// Background-worker job: scrapes every product page listed in textBox5, extracts the
/// product name, model id and description from the HTML, downloads the product image into
/// the local Images folder and saves the rows to the Tb_Product table.
/// </summary>
/// <remarks>
/// The marker strings and offsets used for extraction are tied to the HTML of the scraped
/// site and must be adapted for any other site.
/// NOTE(review): this handler reads textBox2/textBox5 and writes toolStripStatusLabel1,
/// and calls DataBind/ShowError. BackgroundWorker raises DoWork off the UI thread, so
/// consider passing the inputs through e.Argument and moving UI updates to the
/// ProgressChanged / RunWorkerCompleted handlers.
/// </remarks>
/// <param name="sender">The BackgroundWorker running this job.</param>
/// <param name="e">Event data; Cancel is set when the job stops on a cancellation request.</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    // Empty the product table; the adapter below then loads its schema for the new rows.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();
    BackgroundWorker worker = (BackgroundWorker)sender;
    // URLs keep their original case: paths and query strings can be case-sensitive.
    string[] Urls = textBox5.Text.Trim().Replace("\r\n", ",").Split(',');
    string domain = textBox2.Text.Trim();
    StringBuilder ErrorStr = new StringBuilder();
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";
    // Make sure the download target exists so image saving cannot fail for that reason.
    System.IO.Directory.CreateDirectory(ImageDir);

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder is not referenced again, but it must exist: it supplies the
    // INSERT command that da.Update uses.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Process one product page per iteration.
        for (int i = 0; i < Urls.Length; i++)
        {
            if (worker.CancellationPending)
            {
                // Stop scraping, but still save the rows collected so far (below).
                e.Cancel = true;
                break;
            }

            string url = Urls[i].Trim();
            if (url.Length == 0)
            {
                // Skip blank lines. Returning here would discard every row already scraped.
                continue;
            }

            try
            {
                string html = inc.GetHtml(url);
                DataRow NewRow = dt2.NewRow();

                // Product name: the text between <title> and </title>.
                int titleStart = html.IndexOf("<title>", StringComparison.Ordinal);
                int titleEnd = titleStart < 0 ? -1 : html.IndexOf("</title>", titleStart + 7, StringComparison.Ordinal);
                if (titleEnd < 0)
                {
                    throw new InvalidOperationException("<title> element not found");
                }
                string productName = html.Substring(titleStart + 7, titleEnd - titleStart - 7).Trim();
                NewRow["ProductName"] = productName;

                // Model id: everything after "Model:" in the product name.
                int modelAt = productName.IndexOf("Model:", StringComparison.Ordinal);
                if (modelAt < 0)
                {
                    throw new InvalidOperationException("\"Model:\" marker not found in the page title");
                }
                NewRow["ModelId"] = productName.Substring(modelAt + 6).Trim();

                // Description: starts 26 characters after the start of "Product Details"
                // (the 15-character marker plus 11 more, presumably site-specific markup)
                // and runs through the first closing </table>.
                int detailsAt = html.IndexOf("Product Details", StringComparison.Ordinal);
                if (detailsAt < 0 || detailsAt + 26 > html.Length)
                {
                    throw new InvalidOperationException("\"Product Details\" section not found");
                }
                string Introduce = html.Substring(detailsAt + 26);
                int tableEnd = Introduce.IndexOf("</table>", StringComparison.Ordinal);
                if (tableEnd < 0)
                {
                    throw new InvalidOperationException("closing </table> of the product details not found");
                }
                NewRow["Introduce"] = Introduce.Remove(tableEnd + 8).Trim();

                // Image: the src attribute of the first <img> following "align=center>".
                // A missing or failing image is logged but does not drop the product row.
                string ProductImage = null;
                int imgAt = html.IndexOf("align=center><img", StringComparison.Ordinal);
                if (imgAt >= 0)
                {
                    string rest = html.Substring(imgAt + 17);
                    int srcAt = rest.IndexOf("src=\"", StringComparison.Ordinal);
                    if (srcAt >= 0)
                    {
                        rest = rest.Substring(srcAt + 5);
                        int quoteAt = rest.IndexOf('"');
                        if (quoteAt >= 0)
                        {
                            ProductImage = domain + rest.Remove(quoteAt);
                        }
                    }
                }
                if (ProductImage == null)
                {
                    ErrorStr.Append("Product image not found; URL: " + url + "\r\n");
                }
                else
                {
                    string localPath = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);
                    try
                    {
                        inc.DownFile(ProductImage, localPath);
                    }
                    catch (Exception)
                    {
                        ErrorStr.Append("下載圖片失敗,圖片地址:" + localPath + "\r\n");
                    }
                }

                dt2.Rows.Add(NewRow);
                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                // Progress text shown in the status bar.
                toolStripStatusLabel1.Text = "處理進度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                // One bad page must not abort the whole run: record it and move on.
                ErrorStr.Append("采集錯誤:" + err.Message + ";網(wǎng)址:" + url + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}
/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// </summary>
/// <param name="url">Absolute address of the page to download.</param>
/// <returns>
/// The response body, decoded with the character set reported by the response, or with
/// UTF-8 when that character set is missing or not recognised.
/// </returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);
    // Dispose the response and reader even when reading fails, so connections are not leaked.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    {
        Encoding encoding;
        try
        {
            encoding = Encoding.GetEncoding(response.CharacterSet);
        }
        catch (ArgumentException)
        {
            // Empty, null or unknown charset name: fall back instead of failing the download.
            encoding = Encoding.UTF8;
        }

        using (StreamReader sr = new StreamReader(response.GetResponseStream(), encoding))
        {
            return sr.ReadToEnd();
        }
    }
}
/// <summary>
/// Returns every distinct substring of <paramref name="htmlCode"/> matched by
/// <paramref name="strRegex"/>, sorted with the default comparer.
/// </summary>
/// <param name="htmlCode">Text to search, typically the HTML of a page.</param>
/// <param name="strRegex">Regular expression; applied with IgnoreCase and Multiline.</param>
/// <returns>An ArrayList of strings holding the full text of each unique match.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    ArrayList al = new ArrayList();
    // Tracks values already added so duplicates are rejected with a hash lookup
    // instead of rescanning the whole result list for every match.
    Hashtable seen = new Hashtable();
    Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    foreach (Match m in r.Matches(htmlCode))
    {
        string value = m.Value;
        if (!seen.ContainsKey(value))
        {
            seen.Add(value, null);
            al.Add(value);
        }
    }
    al.Sort();
    return al;
}
/// <summary>
/// Downloads the resource at <paramref name="Url"/> and saves it to the file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute HTTP address of the file to download.</param>
/// <param name="Path">Local file path to write to.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file. OpenOrCreate would leave the tail of a
    // longer old file behind the new content and corrupt the download.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] b = new byte[4096];
        int n;
        while ((n = stream.Read(b, 0, b.Length)) > 0)
        {
            fs.Write(b, 0, n);
        }
    }
}
相關(guān)文章
DropDownList獲取的SelectIndex一直為0的問題
由于初始化判斷出錯導致每次傳到服務器的時候會初始化一次,這就導致每次獲取DropDownList的SelectIndex的時候只能是0 2014-06-06
asp.net通過js實現Cookie創建以及清除Cookie數組的代碼
asp.net Cookie創建以及清除Cookie數組 2010-03-03
.NET 6開發TodoList應用之實現Repository模式
這篇文章主要介紹了如何實現一個可重用的Repository模塊。文中的示例代碼講解詳細,對我們學習或工作有一定的幫助,感興趣的小伙伴可以跟隨小編一起學習一下 2021-12-12
asp.net中使用DatagridView的增刪改方法具體實現
asp.net中使用DatagridView的增刪改方法具體實現,需要的朋友可以參考一下 2013-06-06
ASP.NET MVC中使用jQuery時的瀏覽器緩存問題詳解
這篇文章主要介紹了ASP.NET MVC中使用jQuery時的瀏覽器緩存問題詳解,需要的朋友可以參考下。 2016-06-06
ASP.NET MVC3 SEO優化:利用Routing特性提高站點權重
這篇文章主要介紹了ASP.NET MVC3 SEO優化:利用Routing特性消除多個路徑指向同一個Action,從而提高站點權重,需要的朋友可以參考下。 2016-06-06