使用Go colly抓取PTT Beauty版的內容。
範例環境:
- Go 1.19
- github.com/gocolly/colly/v2
事前要求
參考「Golang 網頁爬蟲範例」安裝colly及了解基本用法。
範例一
下面使用colly爬取PTT Beauty板首頁https://www.ptt.cc/bbs/Beauty/index.html
中每篇文章的標題。
呼叫colly.Collector.Post
先導向https://www.ptt.cc/ask/over18
頁面詢問是否已滿18歲,當點選「是」的時候夾帶以下form data給PTT,然後才會導向原本要去的看板頁面https://www.ptt.cc/bbs/Beauty/index.html
{
from: "/bbs/Beauty/index.html",
yes: "yes"
}
在colly.Collector.OnHTML
取得回應網頁中文章的標題,選擇器為尋找<div class="title"><a></a></div>
中<a>元素的文字內容。
main.go
package main
import (
"fmt"
"github.com/gocolly/colly/v2"
)
// main prints the title of every post listed on the PTT Beauty board index
// page.
func main() {
	c := colly.NewCollector()

	// Answer PTT's age-confirmation form first; without it the board URL
	// leads to the confirmation page instead of the post list.
	err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
		"from": "/bbs/Beauty/index.html",
		"yes":  "yes",
	})
	if err != nil {
		fmt.Println("confirm over18:", err)
		return
	}

	// The handler is registered only after the confirmation request, so it is
	// applied to the board page fetched below and not to the confirmation
	// response. Each post row is a <div class="title"> wrapping one <a>.
	c.OnHTML("div.title", func(e *colly.HTMLElement) {
		e.ForEach("a", func(_ int, a *colly.HTMLElement) {
			fmt.Println(a.Text)
		})
	})

	if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
		fmt.Println("visit board index:", err)
	}
}
範例二
下面改爬取每篇文章的連結,將colly.Collector.Async
設為true以使用非同步HTTP請求。
建立urls
slice在colly.OnHTML
搜集每篇文章的連結,使用colly.HTMLElement.Attr
傳入屬性名稱"href"
取得<a>
的連結位址。
最後呼叫colly.Collector.Wait()
等待非同步請求結束。
main.go
package main
import (
"fmt"
"strings"
"github.com/gocolly/colly/v2"
)
// main collects the link of every post on the PTT Beauty board index page
// using asynchronous requests and prints one link per line.
func main() {
	c := colly.NewCollector()
	c.Async = true

	err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
		"from": "/bbs/Beauty/index.html",
		"yes":  "yes",
	})
	if err != nil {
		fmt.Println("confirm over18:", err)
		return
	}
	// In async mode Post only queues the request. Wait for it to finish so
	// the age confirmation is done before the board page is requested and
	// before any handler is registered.
	c.Wait()

	var urls []string // collected post URLs
	c.OnHTML("div.title", func(e *colly.HTMLElement) {
		e.ForEach("a", func(_ int, a *colly.HTMLElement) {
			urls = append(urls, a.Attr("href"))
		})
	})

	if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
		fmt.Println("visit board index:", err)
		return
	}
	c.Wait() // wait for the async HTTP call

	fmt.Println(strings.Join(urls, "\n"))
}
測試
執行印出以下。
/bbs/Beauty/M.1674325766.A.AF0.html
/bbs/Beauty/M.1674328839.A.099.html
/bbs/Beauty/M.1663845030.A.EF9.html
/bbs/Beauty/M.1621836193.A.468.html
/bbs/Beauty/M.1666371664.A.B29.html
範例三
下面爬取每一頁的連結。
建立pageUrls []string
slice在colly.OnHTML
遞迴呼叫colly.Collector.Visit
搜集每個「上頁」的連結。
main.go
package main
import (
"fmt"
"strings"
"github.com/gocolly/colly/v2"
)
// main walks backwards through the PTT Beauty board index pages and prints
// the link of each previous page it follows, one per line.
func main() {
	c := colly.NewCollector()
	c.Async = true

	err := c.Post("https://www.ptt.cc/ask/over18", map[string]string{
		"from": "/bbs/Beauty/index.html",
		"yes":  "yes",
	})
	if err != nil {
		fmt.Println("confirm over18:", err)
		return
	}
	// In async mode Post only queues the request. Wait for it to finish so
	// the age confirmation is done before the paging handler is registered
	// and the board page is requested.
	c.Wait()

	var pageUrls []string
	next(c, &pageUrls, 0, 3) // follow at most 3 previous-page links

	if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
		fmt.Println("visit board index:", err)
		return
	}
	c.Wait()

	// One link per line, the same output format as the other examples.
	fmt.Println(strings.Join(pageUrls, "\n"))
}
// next registers a handler on c that follows the "上頁" (previous page) link
// of every paging bar it sees. Each followed link's href is appended to
// *pageUrls as it appears in the page. num is the number of paging bars
// already handled and max is the number of bars whose link is followed; once
// num exceeds max the handler does nothing.
func next(c *colly.Collector, pageUrls *[]string, num, max int) {
	c.OnHTML("div.btn-group.btn-group-paging", func(d *colly.HTMLElement) {
		num++
		if num > max {
			return
		}
		d.ForEach("a.btn.wide", func(_ int, a *colly.HTMLElement) {
			if !strings.Contains(a.Text, "上頁") {
				return
			}
			href := a.Attr("href")
			if href == "" {
				// No target on this button, so there is nothing to follow.
				return
			}
			*pageUrls = append(*pageUrls, href)
			// Resolve the href against the page it was found on instead of
			// hard-coding the host.
			if err := c.Visit(a.Request.AbsoluteURL(href)); err != nil {
				fmt.Println("visit "+href+":", err)
			}
		})
	})
}
測試
執行印出以下。
/bbs/Beauty/index3999.html
/bbs/Beauty/index3998.html
/bbs/Beauty/index3997.html
範例四
綜合範例二、三,下面爬取每頁每則PO文的連結。
main.go
package main
import (
"fmt"
"strings"
"time"
"github.com/gocolly/colly/v2"
)
// main first gathers the links of three previous index pages and prints
// them, then gathers and prints the link of every post found on those pages.
func main() {
	pageCollector := newOver18AsyncCollector()
	pages := collectPageUrls(pageCollector, 3)
	fmt.Println(strings.Join(pages, "\n"))

	// A second collector is used for the post links.
	postCollector := newOver18AsyncCollector()
	var posts []string
	for i := range pages {
		found := collectPostUrls(postCollector, pages[i])
		posts = append(posts, found...)
	}
	fmt.Println(strings.Join(posts, "\n"))
}
// newOver18AsyncCollector returns an asynchronous collector with a browser
// user agent and a request limit rule, after submitting PTT's
// age-confirmation form on it.
func newOver18AsyncCollector() *colly.Collector {
	c := colly.NewCollector(
		colly.Async(true),
		colly.UserAgent("Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"),
	)

	// A limit rule needs a domain pattern: colly rejects a rule that has
	// neither DomainGlob nor DomainRegexp, and such a rule matches no request.
	// Install it before the first request so that request is limited too.
	err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		Parallelism: 2,
		RandomDelay: 5 * time.Second,
	})
	if err != nil {
		fmt.Println("set limit rule:", err)
	}

	err = c.Post("https://www.ptt.cc/ask/over18", map[string]string{
		"from": "/bbs/Beauty/index.html",
		"yes":  "yes",
	})
	if err != nil {
		fmt.Println("confirm over18:", err)
	}
	// The collector is asynchronous, so Post only queues the request. Wait
	// here so the confirmation has finished before the caller starts
	// visiting pages.
	c.Wait()

	return c
}
// collectPageUrls starts at the Beauty board index page and follows the
// previous-page link up to pageCount times. It returns the hrefs of the
// followed links in the order they were found, or nil if the index page
// could not be requested.
func collectPageUrls(c *colly.Collector, pageCount int) []string {
	var pageUrls []string
	nextPage(c, &pageUrls, 0, pageCount)

	if err := c.Visit("https://www.ptt.cc/bbs/Beauty/index.html"); err != nil {
		fmt.Println("visit board index:", err)
		return nil
	}
	// Block until the chain of page requests started above has finished.
	c.Wait()

	return pageUrls
}
// nextPage registers a handler on c that follows the "上頁" (previous page)
// link of every paging bar it sees. Each followed link's href is appended to
// *pageUrls as it appears in the page. num is the number of paging bars
// already handled and max is the number of bars whose link is followed; once
// num exceeds max the handler does nothing.
func nextPage(c *colly.Collector, pageUrls *[]string, num, max int) {
	c.OnHTML("div.btn-group.btn-group-paging", func(d *colly.HTMLElement) {
		num++
		if num > max {
			return
		}
		d.ForEach("a.btn.wide", func(_ int, a *colly.HTMLElement) {
			if !strings.Contains(a.Text, "上頁") {
				return
			}
			href := a.Attr("href")
			if href == "" {
				// No target on this button, so there is nothing to follow.
				return
			}
			*pageUrls = append(*pageUrls, href)
			// Resolve the href against the page it was found on instead of
			// hard-coding the host.
			if err := c.Visit(a.Request.AbsoluteURL(href)); err != nil {
				fmt.Println("visit "+href+":", err)
			}
		})
	})
}
// collectPostUrls fetches the board page at pageUrl (a path such as
// "/bbs/Beauty/index3999.html") and returns the href of every post link on
// it. It returns nil if the page could not be requested.
func collectPostUrls(c *colly.Collector, pageUrl string) []string {
	// Let any request still in flight on c finish before this page is
	// requested.
	c.Wait()

	// Work on a clone so the handler registered here does not pile up on the
	// caller's collector across calls. Per colly's documentation a clone
	// carries no callbacks but shares the HTTP backend of its source.
	pc := c.Clone()

	var urls []string // collected post URLs
	pc.OnHTML("div.title", func(e *colly.HTMLElement) {
		e.ForEach("a", func(_ int, a *colly.HTMLElement) {
			urls = append(urls, a.Attr("href"))
		})
	})

	// The request must be issued here, outside the handler: a Visit placed
	// inside the handler only runs once some other page has already been
	// parsed.
	if err := pc.Visit("https://www.ptt.cc" + pageUrl); err != nil {
		fmt.Println("visit "+pageUrl+":", err)
		return nil
	}
	pc.Wait()

	return urls
}
測試
執行印出以下。
/bbs/Beauty/index4001.html
/bbs/Beauty/index4000.html
/bbs/Beauty/index3999.html
/bbs/Beauty/M.1674464781.A.B66.html
/bbs/Beauty/M.1674465227.A.DB7.html
/bbs/Beauty/M.1674465489.A.DCA.html
/bbs/Beauty/M.1674465863.A.F2A.html
/bbs/Beauty/M.1674467603.A.46D.html
/bbs/Beauty/M.1674470766.A.D60.html
/bbs/Beauty/M.1674470771.A.4A9.html
/bbs/Beauty/M.1674475142.A.86A.html
/bbs/Beauty/M.1674475157.A.A72.html
/bbs/Beauty/M.1674476923.A.5B4.html
/bbs/Beauty/M.1674479162.A.047.html
/bbs/Beauty/M.1674482176.A.34C.html
/bbs/Beauty/M.1663845030.A.EF9.html
/bbs/Beauty/M.1621836193.A.468.html
/bbs/Beauty/M.1666371664.A.B29.html
/bbs/Beauty/M.1674400996.A.953.html
/bbs/Beauty/M.1674406878.A.842.html
/bbs/Beauty/M.1674407759.A.9DE.html
/bbs/Beauty/M.1674408364.A.B06.html
/bbs/Beauty/M.1674408977.A.B0F.html
/bbs/Beauty/M.1674409445.A.877.html
/bbs/Beauty/M.1674422217.A.A72.html
/bbs/Beauty/M.1674430155.A.232.html
/bbs/Beauty/M.1674442813.A.0F4.html
/bbs/Beauty/M.1674446871.A.8C6.html
/bbs/Beauty/M.1674451095.A.61B.html
/bbs/Beauty/M.1674452982.A.3EA.html
/bbs/Beauty/M.1674453859.A.D70.html
/bbs/Beauty/M.1674454417.A.FD1.html
/bbs/Beauty/M.1674457089.A.856.html
/bbs/Beauty/M.1674457546.A.C9A.html
/bbs/Beauty/M.1674457686.A.803.html
/bbs/Beauty/M.1674461093.A.6F4.html
沒有留言:
張貼留言