-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.go
115 lines (95 loc) · 2.74 KB
/
crawl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
package crawlptt
import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"

	"github.com/PuerkitoBio/goquery"
)
// PttPostInfo describes one post entry collected from a board index page.
type PttPostInfo struct {
	// Author is the poster's name, Title is the text of the post's link, and
	// Link is the absolute URL of the post itself.
	Author, Title, Link string
}
// PttPost holds the text content extracted from a single post page.
type PttPost struct {
// Content is the post text as cut out of the fetched page by GetPost.
Content string
}
// GetPostInfo lists the posts of a board, starting from the board's index
// page. It builds the index URL for board and hands it, together with pages,
// to GetPostInfoURL, whose results it returns unchanged.
func GetPostInfo(board string, pages int) (post []*PttPostInfo, err error) {
	const (
		boardRoot = "https://www.ptt.cc/bbs/"
		indexPage = "/index.html"
	)
	return GetPostInfoURL(boardRoot+board+indexPage, pages)
}
// pttSiteRoot is prepended to the site-relative hrefs found in index pages.
const pttSiteRoot = "https://www.ptt.cc"

// postLinkPattern matches the href of a link that points at a post. It is
// compiled once here instead of once per anchor element.
var postLinkPattern = regexp.MustCompile("/bbs/.*/M\\.\\d+\\.A\\..+.html")

// GetPostInfoURL collects post entries starting at the index page url and
// then following the "previous page" link.
//
// pages is the number of additional (older) pages to follow after url: 0 reads
// only url, 1 reads url and the page before it, and so on. A negative value
// keeps following previous-page links until a page has none.
//
// Entries are returned in the order they were found, the page at url first.
// If the first page cannot be fetched or parsed, the result is nil and the
// error. If a later page fails, the entries gathered so far are returned
// together with the error, so the failure is no longer silent.
func GetPostInfoURL(url string, pages int) (post []*PttPostInfo, err error) {
	var postList []*PttPostInfo
	for {
		pagePosts, prevHref, pageErr := fetchPostInfoPage(url)
		if pageErr != nil {
			return postList, pageErr
		}
		postList = append(postList, pagePosts...)

		// Stop when the requested depth is reached, or when the page offers
		// no previous-page link to follow.
		if pages == 0 || prevHref == "" {
			return postList, nil
		}
		url = pttSiteRoot + prevHref
		pages--
	}
}

// fetchPostInfoPage downloads one index page and returns the post entries on
// it plus the href of its "previous page" button ("" when none was found).
func fetchPostInfoPage(url string) ([]*PttPostInfo, string, error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, "", err
	}
	// Presumably this cookie skips the site's age-confirmation page; it is
	// sent with every request, as the original code did.
	req.AddCookie(&http.Cookie{Name: "over18", Value: "1"})

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, "", err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, "", fmt.Errorf("fetching %s: unexpected status %s", url, res.Status)
	}

	doc, err := goquery.NewDocumentFromReader(res.Body)
	if err != nil {
		return nil, "", fmt.Errorf("parsing %s: %w", url, err)
	}

	// Extract post title and link from every anchor whose href looks like a
	// post URL.
	var posts []*PttPostInfo
	doc.Find("a[href]").Each(func(_ int, item *goquery.Selection) {
		// The selector only yields anchors that carry an href attribute, so
		// the "exists" result is not needed.
		href, _ := item.Attr("href")
		if postLinkPattern.MatchString(href) {
			posts = append(posts, &PttPostInfo{Title: item.Text(), Link: pttSiteRoot + href})
		}
	})

	// Extract post authors. Authors are paired with posts by position using a
	// separate counter, because entries whose author is "-" (deleted posts,
	// per the original comment) have no matching link and must be skipped.
	next := 0
	doc.Find(".author").Each(func(_ int, item *goquery.Selection) {
		author := item.Text()
		// The bounds check guards against pages that list more authors than
		// matched post links, which previously caused an index panic.
		if author == "-" || next >= len(posts) {
			return
		}
		posts[next].Author = author
		next++
	})

	// Extract the link to the previous page. A button without an href leaves
	// prevHref empty, which tells the caller to stop.
	prevHref := ""
	doc.Find(".btn.wide").Each(func(_ int, item *goquery.Selection) {
		if item.Text() == "‹ 上頁" {
			prevHref, _ = item.Attr("href")
		}
	})

	return posts, prevHref, nil
}
// postContentPattern matches from a four-digit number followed by "</span>"
// up to the last "--" in the page. It is compiled once instead of per call.
var postContentPattern = regexp.MustCompile("[0-9]{4}</span>[\\s\\S]*--")

// postContentSkip is the number of leading bytes dropped from the match. The
// pattern itself accounts for 11 of them (four digits plus "</span>"); the
// remaining 6 are presumably a closing "</div>" — confirm against a live page.
const postContentSkip = 17

// GetPost fetches the post page at url and extracts its content.
//
// The returned Content is the part of the page matched by postContentPattern
// with the first postContentSkip bytes removed; it still ends with "--".
//
// An error is returned when the request fails, the server answers with a
// status other than 200, the body cannot be read, or the page has no match
// long enough to hold any content. The last two cases previously went
// unchecked and could panic.
func GetPost(url string) (post *PttPost, err error) {
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		return nil, err
	}
	// Presumably this cookie skips the site's age-confirmation page; it is
	// sent with every request, as the original code did.
	req.AddCookie(&http.Cookie{Name: "over18", Value: "1"})

	res, err := http.DefaultClient.Do(req)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()
	if res.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("fetching post %s: unexpected status %s", url, res.Status)
	}

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return nil, fmt.Errorf("reading post %s: %w", url, err)
	}

	// FindString returns "" when nothing matches, so one length check covers
	// both "no match" and "match too short to strip the prefix".
	match := postContentPattern.FindString(string(body))
	if len(match) < postContentSkip {
		return nil, fmt.Errorf("no post content found in %s", url)
	}
	return &PttPost{Content: match[postContentSkip:]}, nil
}