Go語言結合正則表達式實現高效獲取數據

更新時間：2025年04月28日 09:52:34 作者：Ai 編碼

這篇文章主要為大家詳細介紹了Go語言如何結合正則表達式實現高效獲取數據,文中的示例代碼講解詳細,感興趣的小伙伴可以跟隨小編一起學習一下

基礎網頁內容爬取

1.1 獲取網頁中所有鏈接

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// main fetches https://example.com and prints the href value of every <a>
// tag found in the response body, one per line.
func main() {
	// Send the HTTP request.
	resp, err := http.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}
	defer resp.Body.Close()

	// A non-200 reply is an error page, not the document we want to scan.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP請(qǐng)求失敗:", resp.Status)
		return
	}

	// Read the whole response body.
	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("讀取響應(yīng)失敗:", err)
		return
	}

	// Match the href attribute of <a> tags, case-insensitively.
	// Whitespace is required right after "<a" so tags such as <abbr> are not
	// picked up, and href must be the first thing in the tag or be preceded by
	// whitespace, so look-alike attributes such as data-href are skipped. The
	// lazy [^>]*? selects the first href in the tag rather than the last.
	linkRe := regexp.MustCompile(`(?i)<a\s(?:[^>]*?\s)?href\s*=\s*["']([^"']*)["']`)
	matches := linkRe.FindAllStringSubmatch(string(body), -1)

	// Print every link; index 1 is the single capture group and is always
	// present on a match.
	fmt.Println("找到的鏈接:")
	for _, m := range matches {
		fmt.Println(m[1])
	}
}

1.2 提取特定模式的文本

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// main fetches https://example.com and prints the text of every <h1>-<h6>
// heading in the response body, with nested markup stripped.
func main() {
	resp, err := http.Get("https://example.com")
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}
	defer resp.Body.Close()

	// A non-200 reply is an error page, not the document we want to scan.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP請(qǐng)求失敗:", resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}

	// (?s) lets "." cross line breaks so headings that span several lines are
	// found; (?i) accepts upper-case tag names.
	headingRe := regexp.MustCompile(`(?is)<h[1-6][^>]*>(.*?)</h[1-6]>`)
	// Compiled once here instead of once per heading inside the loop.
	tagRe := regexp.MustCompile(`<[^>]+>`)

	fmt.Println("網(wǎng)頁(yè)標(biāo)題:")
	for _, h := range headingRe.FindAllStringSubmatch(string(body), -1) {
		// Strip any HTML tags nested inside the heading.
		fmt.Println(tagRe.ReplaceAllString(h[1], ""))
	}
}

結構化數據爬取

2.1 爬取表格數據

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strings"
)

// main fetches https://example.com/table-page, locates the first <table> in
// the response and prints its cells as tab-separated text, one table row per
// output line.
func main() {
	resp, err := http.Get("https://example.com/table-page")
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}
	defer resp.Body.Close()

	// A non-200 reply is an error page, not the document we want to scan.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP請(qǐng)求失敗:", resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}
	content := string(body)

	// All patterns use (?s) so "." crosses line breaks: real tables span many
	// lines. (?:\s[^>]*)? allows attributes while keeping "<tr" from matching
	// "<track" and "<th" from matching "<thead".
	tableRe := regexp.MustCompile(`(?is)<table(?:\s[^>]*)?>(.*?)</table>`)
	rowRe := regexp.MustCompile(`(?is)<tr(?:\s[^>]*)?>(.*?)</tr>`)
	cellRe := regexp.MustCompile(`(?is)<t[dh](?:\s[^>]*)?>(.*?)</t[dh]>`)
	// Compiled once here instead of once per cell inside the loops.
	tagRe := regexp.MustCompile(`<[^>]+>`)

	// Match the whole table.
	tableMatch := tableRe.FindStringSubmatch(content)
	if tableMatch == nil {
		fmt.Println("未找到表格")
		return
	}

	fmt.Println("表格數(shù)據(jù):")
	for _, row := range rowRe.FindAllStringSubmatch(tableMatch[1], -1) {
		for _, cell := range cellRe.FindAllStringSubmatch(row[1], -1) {
			// Clean the cell: drop nested tags, then surrounding whitespace.
			text := strings.TrimSpace(tagRe.ReplaceAllString(cell[1], ""))
			fmt.Printf("%s\t", text)
		}
		fmt.Println() // end of table row
	}
}

2.2 爬取JSON數據中的特定字段

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// Product is the element type that the products JSON array is decoded into.
type Product struct {
	// Name is read from the "name" JSON key.
	Name  string  `json:"name"`
	// Price is read from the "price" JSON key.
	Price float64 `json:"price"`
}

// main fetches https://api.example.com/products and prints each product's
// name and price. It first tries to decode the body as a JSON array of
// Product; if that fails it falls back to a regular expression that pulls
// "name"/"price" pairs out of the raw text.
func main() {
	resp, err := http.Get("https://api.example.com/products")
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}
	defer resp.Body.Close()

	// An error reply does not carry product data worth parsing.
	if resp.StatusCode != http.StatusOK {
		fmt.Println("HTTP請(qǐng)求失敗:", resp.Status)
		return
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println("HTTP請(qǐng)求失敗:", err)
		return
	}

	// Method 1: decode the JSON directly.
	var products []Product
	if err := json.Unmarshal(body, &products); err == nil {
		fmt.Println("產(chǎn)品列表(JSON解析):")
		for _, p := range products {
			fmt.Printf("%s - $%.2f\n", p.Name, p.Price)
		}
		return
	}

	// Method 2: the JSON shape is not the expected array, so fall back to a
	// regular expression. The decode error is intentionally not reported.
	fmt.Println("\n嘗試使用正則表達(dá)式提取:")

	// Match a product name and the price that follows it inside the same
	// object ([^}]+ cannot cross a closing brace). The number part accepts an
	// optional sign, fraction and exponent, as JSON numbers may have.
	re := regexp.MustCompile(`"name"\s*:\s*"([^"]+)"[^}]+"price"\s*:\s*(-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)`)
	for _, m := range re.FindAllStringSubmatch(string(body), -1) {
		fmt.Printf("%s - $%s\n", m[1], m[2])
	}
}

高級爬蟲技巧

3.1 帶并發控制的爬蟲

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"sync"
)

// main fetches a fixed list of pages concurrently and prints the <title> of
// each one. A buffered channel is used as a counting semaphore so that at
// most three requests are in flight at once; main returns only after every
// goroutine has finished.
func main() {
	urls := []string{
		"https://example.com/page1",
		"https://example.com/page2",
		"https://example.com/page3",
	}

	var wg sync.WaitGroup
	semaphore := make(chan struct{}, 3) // concurrency limit of 3

	// (?s) lets the title span lines; (?i) accepts upper-case tag names.
	titleRe := regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)

	for _, url := range urls {
		wg.Add(1)
		go func(u string) {
			defer wg.Done()

			// Acquire a slot and release it on every exit path.
			semaphore <- struct{}{}
			defer func() { <-semaphore }()

			resp, err := http.Get(u)
			if err != nil {
				fmt.Printf("獲取 %s 失敗: %v\n", u, err)
				return
			}
			defer resp.Body.Close()

			// Do not report the title of an error page as the page's title.
			if resp.StatusCode != http.StatusOK {
				fmt.Printf("獲取 %s 失敗: %v\n", u, resp.Status)
				return
			}

			body, err := ioutil.ReadAll(resp.Body)
			if err != nil {
				fmt.Printf("獲取 %s 失敗: %v\n", u, err)
				return
			}

			if title := titleRe.FindStringSubmatch(string(body)); title != nil {
				fmt.Printf("%s 的標(biāo)題: %s\n", u, title[1])
			}
		}(url)
	}

	wg.Wait()
}

3.2 處理分頁內容

package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
	"strconv"
)

// main crawls a paginated news listing. It asks getTotalPages how many pages
// exist, then downloads each page in turn and prints the <h2> title found
// inside every <div class="news-item"> block. A page that cannot be fetched
// is reported and skipped.
func main() {
	baseURL := "https://example.com/news?page="
	// (?s) lets "." cross line breaks so items and titles spanning several
	// lines are still found.
	itemRe := regexp.MustCompile(`(?s)<div class="news-item">(.*?)</div>`)
	titleRe := regexp.MustCompile(`(?s)<h2>(.*?)</h2>`)

	// Determine the total number of pages first.
	totalPages := getTotalPages(baseURL + "1")

	fmt.Printf("共發(fā)現(xiàn) %d 頁(yè)內(nèi)容\n", totalPages)

	// Crawl every page.
	for page := 1; page <= totalPages; page++ {
		pageURL := baseURL + strconv.Itoa(page)
		fmt.Printf("\n正在爬取第 %d 頁(yè): %s\n", page, pageURL)

		resp, err := http.Get(pageURL)
		if err != nil {
			fmt.Printf("獲取第 %d 頁(yè)失敗: %v\n", page, err)
			continue
		}

		// Read and close right away; a defer here would pile up until main
		// returns because we are inside a loop.
		body, err := ioutil.ReadAll(resp.Body)
		resp.Body.Close()
		if err != nil {
			fmt.Printf("獲取第 %d 頁(yè)失敗: %v\n", page, err)
			continue
		}
		if resp.StatusCode != http.StatusOK {
			fmt.Printf("獲取第 %d 頁(yè)失敗: %v\n", page, resp.Status)
			continue
		}

		for _, item := range itemRe.FindAllStringSubmatch(string(body), -1) {
			if title := titleRe.FindStringSubmatch(item[1]); title != nil {
				fmt.Println("新聞標(biāo)題:", title[1])
			}
		}
	}
}

// getTotalPages downloads url and looks for a pager text of the form
// "共 N 頁" (or the simplified "共 N 页") in the body, returning N.
//
// It returns 1 whenever the count cannot be determined: the request fails,
// the reply is not 200 OK, the body cannot be read, no pager text is present,
// or the number is not a positive int. The result is therefore always >= 1.
func getTotalPages(url string) int {
	resp, err := http.Get(url)
	if err != nil {
		return 1 // default to a single page
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return 1
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return 1
	}

	// The pattern previously ended in a stray "(yè)" group, which required the
	// literal letters "yè" after the page character and so never matched a
	// real page. Both the traditional and simplified characters are accepted.
	re := regexp.MustCompile(`共\s*(\d+)\s*[頁页]`)
	match := re.FindSubmatch(body)
	if match == nil {
		return 1
	}

	// Atoi fails on numbers that overflow int; a count below 1 would make the
	// caller skip every page, so both cases fall back to the default.
	total, err := strconv.Atoi(string(match[1]))
	if err != nil || total < 1 {
		return 1
	}
	return total
}

實用技巧與注意事項

1.User-Agent設置：

// Create a client and build the GET request by hand so that a custom
// User-Agent header can be set before the request is sent.
client := &http.Client{}
// NOTE(review): the error returned by http.NewRequest is discarded.
req, _ := http.NewRequest("GET", "https://example.com", nil)
req.Header.Set("User-Agent", "Mozilla/5.0 (compatible; MyBot/1.0)")
// NOTE(review): the error returned by client.Do is discarded, so a failed
// request goes unnoticed here; check it before using resp.
resp, _ := client.Do(req)

2.處理相對鏈接：

import "net/url"

// Parse the base URL and the relative reference, then resolve the reference
// against the base to obtain the absolute URL as a string.
// NOTE(review): both url.Parse errors are discarded.
base, _ := url.Parse("https://example.com")
rel, _ := url.Parse("/page1")
absURL := base.ResolveReference(rel).String()

3.正則表達式優化：

預編譯正則表達式：re := regexp.MustCompile(pattern)

使用非貪婪匹配：.*?

避免過度復雜的正則表達式

4.錯誤處理增強：

// Issue the request and wrap a failure with %w so the underlying error stays
// in the returned error's chain.
resp, err := http.Get(url)
if err != nil {
    return fmt.Errorf("請(qǐng)求失敗: %w", err)
}
// Close the body when the enclosing function returns, and log a close
// failure instead of silently dropping it.
defer func() {
    if err := resp.Body.Close(); err != nil {
        log.Printf("關(guān)閉響應(yīng)體失敗: %v", err)
    }
}()

反爬蟲策略應對

設置合理的請求間隔：

import "time"

// crawlWithDelay starts crawlPage for each entry of urls in its own
// goroutine, pausing for delay between consecutive starts so requests are
// spread out, and returns only after every crawlPage call has finished.
//
// Waiting matters: goroutines that are still running when the program's main
// function returns are killed, so the last pages could otherwise be lost.
// No pause is made before the first start or after the last one.
func crawlWithDelay(urls []string, delay time.Duration) {
	// Buffered so a finished worker never blocks while we are still sleeping.
	done := make(chan struct{}, len(urls))

	// The loop variable is named target rather than url so it cannot shadow
	// the net/url package in files that import it.
	for i, target := range urls {
		if i > 0 {
			time.Sleep(delay)
		}
		go func(u string) {
			defer func() { done <- struct{}{} }()
			crawlPage(u)
		}(target)
	}

	// Collect one completion signal per started goroutine.
	for range urls {
		<-done
	}
}

使用代理IP：

// Build a client whose transport sends its requests through an HTTP proxy.
// NOTE(review): "proxy-ip:port" looks like a placeholder to be replaced with a
// real host and numeric port, and the url.Parse error is discarded — confirm
// before use.
proxyUrl, _ := url.Parse("http://proxy-ip:port")
client := &http.Client{
    Transport: &http.Transport{
        Proxy: http.ProxyURL(proxyUrl),
    },
}
// NOTE(review): the error returned by client.Get is discarded; check it
// before using resp.
resp, _ := client.Get("https://example.com")

處理Cookies：

// Give the client a cookie jar so cookies are kept between requests.
// NOTE(review): the cookiejar.New error is discarded.
jar, _ := cookiejar.New(nil)
client := &http.Client{Jar: jar}
// The first request obtains the cookie.
// NOTE(review): both calls below discard the response and the error, so the
// response bodies are never closed.
client.Get("https://example.com/login")
// Subsequent requests will carry the cookie.
client.Get("https://example.com/protected-page")

總結

以上實例展示了Go語言結合正則表達式進行數據爬取的多種方法：

基礎網頁爬取：獲取鏈接、提取特定內容
結構化數據提取：表格數據、JSON數據
高級技巧：并發控制、分頁處理
實用技巧：User-Agent設置、相對鏈接處理
反爬應對：請求間隔、代理IP、Cookies處理

在實際項目中，建議：

對于結構化數據優先使用API而非HTML解析
復雜的HTML解析考慮使用goquery等專門庫
遵守網站的robots.txt規則
設置合理的爬取頻率，避免對目標網站造成負擔

這些實例可以作為基礎模板，根據具體需求進行調整和擴展。

到此這篇關于Go語言結合正則表達式實現高效獲取數據的文章就介紹到這了,更多相關Go獲取數據內容請搜索腳本之家以前的文章或繼續瀏覽下面的相關文章，希望大家以后多多支持腳本之家！

您可能感興趣的文章: