2023/1/21

Golang 抓取PTT Beauty板網頁資料

使用Go colly抓取PTT Beauty板的內容。

範例環境：

Go 1.19
github.com/gocolly/colly/v2

事前要求

參考「Golang 網頁爬蟲範例」安裝colly及了解基本用法。

範例一

下面使用colly爬取PTT Beauty板首頁https://www.ptt.cc/bbs/Beauty/index.html中每篇文章的標題。

PTT會先導向https://www.ptt.cc/ask/over18頁面詢問是否已滿18歲，因此先呼叫colly.Collector.Post模擬點選「是」，夾帶以下form data送給PTT，之後才會導向原本要去的看板頁面https://www.ptt.cc/bbs/Beauty/index.html。

{
    from: "/bbs/Beauty/index.html",
    yes: "yes"
}

在colly.Collector.OnHTML取得回應網頁中文章的標題，選擇器為尋找<div class="title"><a>...</a></div>元素的文字內容。

main.go

package main

import (
    "fmt"

    "github.com/gocolly/colly/v2"
)

// main prints the title of every post listed on the first page of the PTT
// Beauty board.
func main() {
    c := colly.NewCollector()

    // PTT asks for an age confirmation before showing the board; submit the
    // form the same way clicking "yes" on that page would.
    err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
        "from": "/bbs/Beauty/index.html",
        "yes":  "yes",
    })
    if err != nil {
        fmt.Println("confirm over18:", err)
        return
    }

    // Every post row is a <div class="title"> whose <a> carries the title.
    c.OnHTML("div.title", func(e *colly.HTMLElement) {
        e.ForEach("a", func(_ int, a *colly.HTMLElement) {
            fmt.Println(a.Text)
        })
    })

    if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
        fmt.Println("visit board index:", err)
    }
}

github。

範例二

下面改爬取每篇文章的連結，並將colly.Collector.Async設為true以使用非同步HTTP請求。

建立urls slice在colly.OnHTML搜集每篇文章的連結，使用colly.HTMLElement.Attr傳入屬性名稱"href"取得<a>的連結位址。

最後呼叫colly.Collector.Wait()等待非同步請求結束。

main.go

package main

import (
    "fmt"
    "strings"

    "github.com/gocolly/colly/v2"
)

// main collects the link of every post on the first page of the PTT Beauty
// board using asynchronous requests and prints one link per line.
func main() {
    c := colly.NewCollector()
    c.Async = true

    // Submit the age-confirmation form the same way clicking "yes" would.
    err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
        "from": "/bbs/Beauty/index.html",
        "yes":  "yes",
    })
    if err != nil {
        fmt.Println("confirm over18:", err)
        return
    }
    // In async mode Post returns before the response arrives. Wait here so
    // the confirmation has completed before the board page is requested;
    // otherwise the two requests race each other.
    c.Wait()

    var urls []string // collect url of the posts
    c.OnHTML("div.title", func(e *colly.HTMLElement) {
        e.ForEach("a", func(_ int, a *colly.HTMLElement) {
            urls = append(urls, a.Attr("href"))
        })
    })

    if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
        fmt.Println("visit board index:", err)
        return
    }
    c.Wait() // wait for the async http call and its callbacks to finish

    fmt.Println(strings.Join(urls, "\n"))
}

測試

執行印出以下。

/bbs/Beauty/M.1674325766.A.AF0.html
/bbs/Beauty/M.1674328839.A.099.html
/bbs/Beauty/M.1663845030.A.EF9.html
/bbs/Beauty/M.1621836193.A.468.html
/bbs/Beauty/M.1666371664.A.B29.html

範例三

下面爬取每一頁的連結。

建立pageUrls []string slice，在colly.Collector.OnHTML的callback中遞迴呼叫colly.Collector.Visit，逐頁搜集「上頁」的連結。

main.go

package main

import (
    "fmt"
    "strings"

    "github.com/gocolly/colly/v2"
)

// main walks backwards from the first page of the PTT Beauty board through
// its "previous page" links and prints the collected page links, one per
// line.
func main() {
    c := colly.NewCollector()
    c.Async = true

    // Submit the age-confirmation form the same way clicking "yes" would.
    err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
        "from": "/bbs/Beauty/index.html",
        "yes":  "yes",
    })
    if err != nil {
        fmt.Println("confirm over18:", err)
        return
    }
    // In async mode Post returns before the response arrives. Wait here so
    // the confirmation has completed, and so its response cannot reach the
    // paging handler registered below.
    c.Wait()

    var pageUrls []string

    next(c, &pageUrls, 0, 3) // max recursive call 3 times

    if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
        fmt.Println("visit board index:", err)
        return
    }
    c.Wait()

    // One link per line, matching the sample output shown for this example.
    fmt.Println(strings.Join(pageUrls, "\n"))
}

// next registers a handler on c that follows the "上頁" (previous page)
// button of each visited board page.
//
// The href of every button found is appended to *pageUrls. num is the number
// of pages already processed and max is the number of links to collect; once
// max links have been recorded no further page is requested.
func next(c *colly.Collector, pageUrls *[]string, num, max int) {
    c.OnHTML("div.btn-group.btn-group-paging", func(d *colly.HTMLElement) {
        num++
        if num > max {
            return
        }
        d.ForEach("a.btn.wide", func(_ int, a *colly.HTMLElement) {
            if !strings.Contains(a.Text, "上頁") {
                return
            }
            href := a.Attr("href")
            if href == "" {
                // The button has no target, so there is nothing to record
                // or follow.
                return
            }
            *pageUrls = append(*pageUrls, href)
            if num == max {
                // The last wanted link has been recorded; fetching that page
                // would only produce a response that is thrown away.
                return
            }
            if err := c.Visit("https://www.ptt.cc" + href); err != nil {
                fmt.Println("visit previous page:", err)
            }
        })
    })
}

github。

測試

執行印出以下。

/bbs/Beauty/index3999.html
/bbs/Beauty/index3998.html
/bbs/Beauty/index3997.html

範例四

綜合範例二、三，下面爬取每頁每則PO文的連結。

main.go

package main

import (
    "fmt"
    "strings"
    "time"

    "github.com/gocolly/colly/v2"
)

// main first gathers board page links, prints them, then gathers the post
// links found on each of those pages with a fresh collector and prints those
// as well. Both lists are printed one link per line.
func main() {
    pageCollector := newOver18AsyncCollector()
    pages := collectPageUrls(pageCollector, 3)
    fmt.Println(strings.Join(pages, "\n"))

    // A second collector is used for the posts, as in the page-link step.
    postCollector := newOver18AsyncCollector()
    var posts []string
    for i := range pages {
        found := collectPostUrls(postCollector, pages[i])
        posts = append(posts, found...)
    }
    fmt.Println(strings.Join(posts, "\n"))
}

// newOver18AsyncCollector returns an asynchronous collector with a browser
// user agent and a request throttle for ptt.cc, on which the PTT
// age-confirmation form has already been submitted.
//
// The collector is asynchronous, so the confirmation request may still be in
// flight when this function returns; callers that depend on it should call
// Wait on the collector first.
func newOver18AsyncCollector() *colly.Collector {
    c := colly.NewCollector(
        colly.Async(true),
        colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"),
    )

    // Install the throttle before the first request is issued. A LimitRule
    // needs a DomainGlob or DomainRegexp: without one Limit returns an error
    // and the rule is never applied.
    limitErr := c.Limit(&colly.LimitRule{
        DomainGlob:  "*ptt.cc",
        Parallelism: 2,
        RandomDelay: 5 * time.Second,
    })
    if limitErr != nil {
        fmt.Println("set limit rule:", limitErr)
    }

    // Submit the age-confirmation form the same way clicking "yes" would.
    postErr := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
        "from": "/bbs/Beauty/index.html",
        "yes":  "yes",
    })
    if postErr != nil {
        fmt.Println("confirm over18:", postErr)
    }
    return c
}

// collectPageUrls starts at the first page of the PTT Beauty board and
// returns the links gathered by following its "previous page" buttons;
// pageCount is passed to nextPage as the number of links to collect.
// It returns nil when the first page cannot be requested.
func collectPageUrls(c *colly.Collector, pageCount int) []string {
    // c is asynchronous and may still have a request in flight (for example
    // the age confirmation). Drain it before registering the paging handler
    // so that its response cannot start a second, competing crawl that
    // shares the same counter.
    c.Wait()

    var pageUrls []string
    nextPage(c, &pageUrls, 0, pageCount)
    if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
        fmt.Println("visit board index:", err)
        return nil
    }
    c.Wait()
    return pageUrls
}

// nextPage registers a handler on c that follows the "上頁" (previous page)
// button of each visited board page.
//
// The href of every button found is appended to *pageUrls. num is the number
// of pages already processed and max is the number of links to collect; once
// max links have been recorded no further page is requested.
func nextPage(c *colly.Collector, pageUrls *[]string, num, max int) {
    c.OnHTML("div.btn-group.btn-group-paging", func(d *colly.HTMLElement) {
        num++
        if num > max {
            return
        }
        d.ForEach("a.btn.wide", func(_ int, a *colly.HTMLElement) {
            if !strings.Contains(a.Text, "上頁") {
                return
            }
            href := a.Attr("href")
            if href == "" {
                // The button has no target, so there is nothing to record
                // or follow.
                return
            }
            *pageUrls = append(*pageUrls, href)
            if num == max {
                // The last wanted link has been recorded; fetching that page
                // would only produce a response that is thrown away.
                return
            }
            if err := c.Visit("https://www.ptt.cc" + href); err != nil {
                fmt.Println("visit previous page:", err)
            }
        })
    })
}

// collectPostUrls requests the board page at pageUrl (a site-relative path
// such as "/bbs/Beauty/index4000.html") and returns the href of every post
// link found on it. It returns nil when the page cannot be requested.
func collectPostUrls(c *colly.Collector, pageUrl string) []string {
    // Let requests already issued on c (for example the age confirmation)
    // finish before this page is requested.
    c.Wait()

    // Work on a clone: it shares c's HTTP backend and cookie jar but starts
    // without callbacks, so handlers do not pile up on c across calls and
    // only this page's response reaches the handler below.
    pc := c.Clone()

    var urls []string // collect url of the posts
    pc.OnHTML("div.title", func(e *colly.HTMLElement) {
        e.ForEach("a", func(_ int, a *colly.HTMLElement) {
            urls = append(urls, a.Attr("href"))
        })
    })

    // The request must be issued here, not from inside the handler: a handler
    // only runs once some page has already been fetched.
    if err := pc.Visit("https://www.ptt.cc" + pageUrl); err != nil {
        fmt.Println("visit board page:", err)
        return nil
    }
    pc.Wait()
    return urls
}

測試

執行印出以下。

/bbs/Beauty/index4001.html
/bbs/Beauty/index4000.html
/bbs/Beauty/index3999.html
/bbs/Beauty/M.1674464781.A.B66.html
/bbs/Beauty/M.1674465227.A.DB7.html
/bbs/Beauty/M.1674465489.A.DCA.html
/bbs/Beauty/M.1674465863.A.F2A.html
/bbs/Beauty/M.1674467603.A.46D.html
/bbs/Beauty/M.1674470766.A.D60.html
/bbs/Beauty/M.1674470771.A.4A9.html
/bbs/Beauty/M.1674475142.A.86A.html
/bbs/Beauty/M.1674475157.A.A72.html
/bbs/Beauty/M.1674476923.A.5B4.html
/bbs/Beauty/M.1674479162.A.047.html
/bbs/Beauty/M.1674482176.A.34C.html
/bbs/Beauty/M.1663845030.A.EF9.html
/bbs/Beauty/M.1621836193.A.468.html
/bbs/Beauty/M.1666371664.A.B29.html
/bbs/Beauty/M.1674400996.A.953.html
/bbs/Beauty/M.1674406878.A.842.html
/bbs/Beauty/M.1674407759.A.9DE.html
/bbs/Beauty/M.1674408364.A.B06.html
/bbs/Beauty/M.1674408977.A.B0F.html
/bbs/Beauty/M.1674409445.A.877.html
/bbs/Beauty/M.1674422217.A.A72.html
/bbs/Beauty/M.1674430155.A.232.html
/bbs/Beauty/M.1674442813.A.0F4.html
/bbs/Beauty/M.1674446871.A.8C6.html
/bbs/Beauty/M.1674451095.A.61B.html
/bbs/Beauty/M.1674452982.A.3EA.html
/bbs/Beauty/M.1674453859.A.D70.html
/bbs/Beauty/M.1674454417.A.FD1.html
/bbs/Beauty/M.1674457089.A.856.html
/bbs/Beauty/M.1674457546.A.C9A.html
/bbs/Beauty/M.1674457686.A.803.html
/bbs/Beauty/M.1674461093.A.6F4.html

Python 使用 Requests 套件抓取 PTT 網頁資料

沒有留言:

張貼留言

菜鳥工程師肉豬

AdSense

網頁

2023/1/21

Golang 抓取PTT Beauty板網頁資料

事前要求

範例一

main.go

範例二

main.go

測試

範例三

main.go

測試

範例四

main.go

測試

沒有留言:

AdSense

標籤

網誌存檔