Skip to content

Latest commit

 

History

History
140 lines (112 loc) · 2.89 KB

golang-spider-colly.md

File metadata and controls

140 lines (112 loc) · 2.89 KB

pkg:

"github.com/gocolly/colly"


go get github.com/gocolly/colly

main.go

package main

import (
	"fmt"
	"github.com/gocolly/colly"
	"github.com/gocolly/colly/extensions"
	_ "github.com/gocolly/colly/extensions"
	"time"
)

// main crawls the Tencent Video home page with two colly collectors.
//
// client fetches the home page synchronously and extracts programme links from
// every "mod_column_bd" block. Each link is handed to client2, which fetches
// the detail pages asynchronously (rate limited) and prints the title,
// director/actor line and summary found in every "mod_row_box" block.
func main() {
	url := "https://v.qq.com/"
	client := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36"),
		//colly.MaxDepth(1),
		//colly.Debugger(&debug.LogDebugger{}),  //"github.com/gocolly/colly/debug"
	)
	// Per the colly documentation, Clone copies the collector's configuration
	// but none of its callbacks, so every handler client2 needs is registered
	// on it explicitly below.
	client2 := client.Clone()

	// Fetch detail pages asynchronously; client2.Wait() at the end blocks until
	// all of them have completed.
	client2.Async = true

	// Rate limit. colly matches a LimitRule against the request host only, so
	// the glob must not contain a URL path (a pattern such as
	// "*v.qq.com/x/cover/*" never matches and the limit is silently ignored).
	if err := client2.Limit(&colly.LimitRule{
		DomainGlob:  "*v.qq.com",
		Delay:       10 * time.Second,
		Parallelism: 1,
	}); err != nil {
		fmt.Println("setting limit rule failed:", err)
		return
	}

	// Collector 1: programme links.
	client.OnHTML("div[class='mod_column_bd']", func(e *colly.HTMLElement) {
		e.ForEach("a", func(i int, item *colly.HTMLElement) {
			// Resolve relative and protocol-relative links against the page
			// URL; AbsoluteURL returns "" for links it cannot resolve.
			href := item.Request.AbsoluteURL(item.Attr("href"))
			if href == "" {
				return
			}
			ctx := colly.NewContext()
			ctx.Put("href", href)
			if err := client2.Request("GET", href, nil, ctx, nil); err != nil {
				// Report the failure and move on to the next link.
				fmt.Println("queueing", href, "failed:", err)
				return
			}
			fmt.Println(href)
		})
	})

	// Use a random User-Agent per request. The extension works through an
	// OnRequest callback, so it has to be installed on both collectors.
	extensions.RandomUserAgent(client)
	extensions.RandomUserAgent(client2)
	// Set a login cookie (disabled example).
	//err2 := client.SetCookies(url, []*http.Cookie{
	//	{
	//		Name:     "remember_user_token",
	//		Value:    "wNDUxOV0sIiQyYSQxMSRwdkhqWVhHYmxXaDJ6dEU3NzJwbmsuIiwiMTU",
	//		Path:     "/",
	//		Domain:   ".jianshu.com",
	//		Secure:   true,
	//		HttpOnly: true,
	//	},
	//})
	//if err2 != nil {
	//	return
	//}

	// Collector 2: programme details.
	client2.OnHTML("div[class='mod_row_box']", func(e *colly.HTMLElement) {
		title := e.ChildText("h2[class='title']")
		actor := e.ChildText("div[class='director']")
		description := e.ChildText("p[class='summary']")
		fmt.Println(title)
		fmt.Println(actor)
		fmt.Println(description)
		fmt.Println()
	})

	client2.OnRequest(func(r *colly.Request) {
		fmt.Println("c2爬取页面:", r.URL)
	})
	// Asynchronous fetch failures are only delivered through OnError.
	client2.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	client.OnRequest(func(r *colly.Request) {
		fmt.Println("c1爬取页面:", r.URL)
	})
	client.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Print the error returned by Visit itself; the previous code discarded it
	// and dereferenced an unrelated, always-nil err, which panicked.
	if err := client.Visit(url); err != nil {
		fmt.Println(err.Error())
	}

	// Block until every asynchronous detail request has finished.
	client2.Wait()
}

生成go.mod

go mod init spider-test
go mod tidy

go.mod

// go.mod for the spider-test module. github.com/gocolly/colly v1.2.0 is the
// only direct requirement; every entry marked "// indirect" is not imported by
// this module's own code.
module spider-test

go 1.16

require (
	github.com/PuerkitoBio/goquery v1.7.1 // indirect
	github.com/antchfx/htmlquery v1.2.3 // indirect
	github.com/antchfx/xmlquery v1.3.6 // indirect
	github.com/gobwas/glob v0.2.3 // indirect
	github.com/gocolly/colly v1.2.0
	github.com/kennygrant/sanitize v1.2.4 // indirect
	github.com/saintfish/chardet v0.0.0-20120816061221-3af4cd4741ca // indirect
	github.com/temoto/robotstxt v1.1.2 // indirect
	golang.org/x/net v0.0.0-20210813160813-60bc85c4be6d // indirect
	google.golang.org/appengine v1.6.7 // indirect
)