forked from PuerkitoBio/gocrawl
-
Notifications
You must be signed in to change notification settings - Fork 1
/
ext.go
188 lines (163 loc) · 6.88 KB
/
ext.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
package gocrawl
import (
"errors"
"log"
"net/http"
"time"
"github.com/PuerkitoBio/goquery"
)
// DelayInfo groups the delays involved in computing the next crawl delay,
// in this order: OptsDelay is the delay configured in the Options,
// RobotsDelay is the delay coming from robots.txt, and LastDelay is the
// last delay that was used.
type DelayInfo struct {
	OptsDelay, RobotsDelay, LastDelay time.Duration
}
// FetchInfo contains the fetch information: the URL context the fetch was
// made for, the duration of the fetch, the returned status code, and whether
// or not it was a HEAD request.
//
// NOTE(review): there is no dedicated robots.txt field on this struct;
// whether the fetch was a robots.txt request is presumably derived from
// Ctx - confirm against URLContext.
type FetchInfo struct {
	Ctx           *URLContext   // URL context this fetch relates to
	Duration      time.Duration // duration of the fetch
	StatusCode    int           // returned status code
	IsHeadRequest bool          // whether or not it was a HEAD request
}
// Extender lists the extension methods (hooks) that the crawler requires.
// Parameter names mirror those used by DefaultExtender.
type Extender interface {
	// Start, End, Error and Log are not tied to a specific URL, which is why
	// none of them receives a URLContext.
	Start(seeds interface{}) interface{}
	End(err error)
	Error(err *CrawlError)
	Log(logFlags LogFlags, msgLevel LogFlags, msg string)

	// ComputeDelay concerns a host only, not a URLContext, even though the
	// FetchInfo it receives is itself tied to a URLContext (through its Ctx
	// field).
	ComputeDelay(host string, di *DelayInfo, lastFetch *FetchInfo) time.Duration

	// Every remaining method runs in the context of a URL, and therefore
	// takes a URLContext as its first argument.
	Fetch(ctx *URLContext, userAgent string, headRequest bool) (*http.Response, error)
	RequestGet(ctx *URLContext, headRes *http.Response) bool
	RequestRobots(ctx *URLContext, robotAgent string) (data []byte, doRequest bool)
	FetchedRobots(ctx *URLContext, res *http.Response)
	Filter(ctx *URLContext, isVisited bool) bool
	Enqueued(ctx *URLContext)
	Visit(ctx *URLContext, res *http.Response, doc *goquery.Document) (harvested interface{}, findLinks bool)
	Visited(ctx *URLContext, harvested interface{})
	Disallowed(ctx *URLContext)
}
// HttpClient is the HTTP client that DefaultExtender's Fetch sends its
// requests through (an http.Client is safe for concurrent use). Its fields
// may be customized (e.g. a different redirection strategy, a different
// Transport, ...), which should be done before the crawler is started.
//
// Its CheckRedirect policy treats robots.txt URLs and all other URLs
// differently; see the comments in the function body.
var HttpClient = &http.Client{
	CheckRedirect: func(req *http.Request, via []*http.Request) error {
		if !isRobotsURL(req.URL) {
			// Regular URLs: do NOT follow the redirection. The default Fetch()
			// implementation will ask the worker to enqueue the new
			// (redirect-to) URL. Returning an error makes the client's Do()
			// return a url.Error whose URL field contains the new URL.
			return ErrEnqueueRedirect
		}

		// robots.txt URLs: allow up to 10 redirects, like the default http
		// client. Rationale: the site owner explicitly tells us that this
		// specific robots.txt should be used for this domain.
		if len(via) >= 10 {
			return errors.New("stopped after 10 redirects")
		}
		if len(via) > 0 {
			// Carry over the user agent of the very first request of the chain.
			first := via[0]
			req.Header.Set("User-Agent", first.Header.Get("User-Agent"))
		}
		return nil
	},
}
// DefaultExtender is a default working implementation of an extender. It is
// possible to embed such a value in a custom struct so that only the
// Extender methods that require custom behaviour have to be implemented.
type DefaultExtender struct {
	// EnqueueChan is a send-only channel of arbitrary values.
	// NOTE(review): no method in this file uses it; presumably the crawler
	// sets it so that an extender can push additional URLs to enqueue -
	// confirm against the crawler code.
	EnqueueChan chan<- interface{}
}
// Start returns the same seeds as those received (those that were passed
// to Run initially), without altering them.
func (de *DefaultExtender) Start(seeds interface{}) interface{} {
	return seeds
}
// End is a no-op: the default implementation ignores the err it receives.
func (de *DefaultExtender) End(err error) {}
// Error is a no-op: the default implementation ignores the CrawlError it
// receives (logging is done automatically, regardless of the implementation
// of the Error hook).
func (de *DefaultExtender) Error(err *CrawlError) {}
// Log writes msg through the standard logger (which prints to the standard
// error by default), but only when the requested log verbosity includes the
// level of the message, i.e. when every bit of msgLevel is set in logFlags.
func (de *DefaultExtender) Log(logFlags LogFlags, msgLevel LogFlags, msg string) {
	if logFlags&msgLevel != msgLevel {
		// At least one bit of msgLevel is not enabled in logFlags: stay silent.
		return
	}
	log.Println(msg)
}
// ComputeDelay picks the delay to apply for a host: the delay specified in
// the Crawler's Options, unless the robots.txt file specifies a (strictly
// positive) crawl-delay, in which case the latter takes precedence. The host
// and lastFetch arguments are not used by this default implementation.
func (de *DefaultExtender) ComputeDelay(host string, di *DelayInfo, lastFetch *FetchInfo) time.Duration {
	delay := di.OptsDelay
	if robots := di.RobotsDelay; robots > 0 {
		// robots.txt wins over the configured option.
		delay = robots
	}
	return delay
}
// Fetch requests the URL of ctx using the given user agent string, issuing a
// HEAD request when headRequest is true and a GET request otherwise. It goes
// through HttpClient, a custom http.Client that doesn't follow redirections.
// Instead, the redirected-to URL is enqueued so that it goes through the same
// Filter and Fetch process as any other URL.
//
// Two options were considered for the default Fetch implementation:
//
//  1. Not following any redirection, and enqueuing the redirect-to URL,
//     failing the current call with the 3xx status code.
//  2. Following all redirections, enqueuing only the last one (where the
//     redirection stops), and returning the response of the next-to-last
//     request.
//
// Ultimately, option 1 was implemented, as it is the most generic solution
// that makes sense as a default for the library. It involves no "magic" and
// gives full control over what can happen. Its drawback is that Filter must
// be aware of every intermediary URL before the final destination of a
// redirection is reached: if A redirects to B, which redirects to C, Filter
// has to allow A, B and C to be fetched, whereas option 2 would only have
// required Filter to allow A and C.
//
// Option 2 also has the disadvantage of fetching the final URL twice: once
// while processing the original URL (to find out that there is no further
// redirection), and once more when the actual destination URL is fetched to
// be visited.
//
// It returns the error from http.NewRequest if the request cannot be built,
// and otherwise whatever HttpClient.Do returns.
func (de *DefaultExtender) Fetch(ctx *URLContext, userAgent string, headRequest bool) (*http.Response, error) {
	// GET by default, HEAD when only the headers are requested.
	method := "GET"
	if headRequest {
		method = "HEAD"
	}

	req, err := http.NewRequest(method, ctx.url.String(), nil)
	if err != nil {
		return nil, err
	}

	// Identify the crawler with the requested user agent.
	req.Header.Set("User-Agent", userAgent)
	return HttpClient.Do(req)
}
// RequestGet tells the worker whether to actually request the URL's body
// (issue a GET) after a HEAD request: it returns true only when the status
// code of the HEAD response is in the 2xx range.
func (de *DefaultExtender) RequestGet(ctx *URLContext, headRes *http.Response) bool {
	// Integer division maps exactly the codes 200..299 to 2.
	return headRes.StatusCode/100 == 2
}
// RequestRobots asks the worker to actually request (fetch) the robots.txt:
// it always returns nil data and doRequest set to true.
//
// NOTE(review): a non-nil data presumably lets a custom extender supply the
// robots.txt content itself instead of fetching it - confirm against the
// worker code.
func (de *DefaultExtender) RequestRobots(ctx *URLContext, robotAgent string) (data []byte, doRequest bool) {
	return nil, true
}
// FetchedRobots is a no-op: the default implementation ignores the response
// it receives.
func (de *DefaultExtender) FetchedRobots(ctx *URLContext, res *http.Response) {}
// Filter reports whether the URL should be enqueued: the default
// implementation accepts the URL if it hasn't been visited yet, i.e. it
// returns the negation of isVisited.
func (de *DefaultExtender) Filter(ctx *URLContext, isVisited bool) bool {
	return !isVisited
}
// Enqueued is a no-op: the default implementation ignores the URL context
// it receives.
func (de *DefaultExtender) Enqueued(ctx *URLContext) {}
// Visit asks the worker to harvest the links in this page: it always returns
// nil for harvested and true for findLinks. The response and the document it
// receives are not inspected by this default implementation.
func (de *DefaultExtender) Visit(ctx *URLContext, res *http.Response, doc *goquery.Document) (harvested interface{}, findLinks bool) {
	return nil, true
}
// Visited is a no-op: the default implementation ignores both the URL
// context and the harvested value it receives.
func (de *DefaultExtender) Visited(ctx *URLContext, harvested interface{}) {}
// Disallowed is a no-op: the default implementation ignores the URL context
// it receives.
func (de *DefaultExtender) Disallowed(ctx *URLContext) {}