#
# This question explores a different way (from the previous question)
# to limit the pages that the crawler can crawl.
#
#######
# THREE GOLD STARS #
# Yes, we really mean it! This is really tough (but doable) unless
# you have some prior experience from before this course.
# Modify the crawl_web procedure to take a second parameter,
# max_depth, that limits the depth of the search. We can
# define the depth of a page as the number of links that must
# be followed to reach that page starting from the seed page,
# that is, the length of the shortest path from the seed to
# the page. No pages whose depth exceeds max_depth should be
# included in the crawl.
#
# For example, if max_depth is 0, the only page that should
# be crawled is the seed page. If max_depth is 1, the pages
# that should be crawled are the seed page and every page that
# it links to directly. If max_depth is 2, the crawl should
# also include all pages that are linked to by these pages.
#
# Note that the pages in the crawl may be in any order.
#
# The following definition of get_page provides an interface
# to the test website rooted at http://www.udacity.com/cs101x/index.html.
# The order of the pages in the returned crawl does not affect grading.
def get_page(url):
    try:
        if url == "http://www.udacity.com/cs101x/index.html":
            return ('<html> <body> This is a test page for learning to crawl! '
                    '<p> It is a good idea to '
                    '<a href="http://www.udacity.com/cs101x/crawling.html">learn to '
                    'crawl</a> before you try to '
                    '<a href="http://www.udacity.com/cs101x/walking.html">walk</a> '
                    'or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>. '
                    '</p> </body> </html> ')
        elif url == "http://www.udacity.com/cs101x/crawling.html":
            return ('<html> <body> I have not learned to crawl yet, but I '
                    'am quite good at '
                    '<a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.'
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/walking.html":
            return ('<html> <body> I cant get enough '
                    '<a href="http://www.udacity.com/cs101x/index.html">crawling</a>! '
                    '</body> </html>')
        elif url == "http://www.udacity.com/cs101x/flying.html":
            return ('<html> <body> The magic words are Squeamish Ossifrage! '
                    '</body> </html>')
        elif url == "http://top.contributors/velak.html":
            return ('<a href="http://top.contributors/jesyspa.html">'
                    '<a href="http://top.contributors/forbiddenvoid.html">')
        elif url == "http://top.contributors/jesyspa.html":
            return ('<a href="http://top.contributors/elssar.html">'
                    '<a href="http://top.contributors/kilaws.html">')
        elif url == "http://top.contributors/forbiddenvoid.html":
            return ('<a href="http://top.contributors/charlzz.html">'
                    '<a href="http://top.contributors/johang.html">'
                    '<a href="http://top.contributors/graemeblake.html">')
        elif url == "http://top.contributors/kilaws.html":
            return ('<a href="http://top.contributors/tomvandenbosch.html">'
                    '<a href="http://top.contributors/mathprof.html">')
        elif url == "http://top.contributors/graemeblake.html":
            return ('<a href="http://top.contributors/dreyescat.html">'
                    '<a href="http://top.contributors/angel.html">')
        elif url == "A1":
            return '<a href="B1"> <a href="C1"> '
        elif url == "B1":
            return '<a href="E1">'
        elif url == "C1":
            return '<a href="D1">'
        elif url == "D1":
            return '<a href="E1"> '
        elif url == "E1":
            return '<a href="F1"> '
    except:
        return ""
    return ""
def get_next_target(page):
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
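
# A quick check of get_next_target on a made-up snippet (my own example,
# not one of the graded test pages): it returns the extracted URL
# together with the index of the URL's closing quote.
# print(get_next_target('<a href="http://example.com/a.html"> more text'))
#>>> ('http://example.com/a.html', 34)
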
def union(p, q):
    for e in q:
        if e not in p:
            p.append(e)

def get_all_links(page):
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links
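
# get_all_links repeatedly calls get_next_target, slicing the page past
# each closing quote, until no link remains. A quick check against one
# of the canned pages above:
# print(get_all_links(get_page("http://top.contributors/velak.html")))
#>>> ['http://top.contributors/jesyspa.html',
#>>>  'http://top.contributors/forbiddenvoid.html']
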
def crawl_web(seed, max_depth):
    tocrawl = [seed]    # pages at the current depth still to be crawled
    crawled = []        # pages already crawled (the result)
    new_depth = []      # links discovered for the next depth level
    depth_counter = 0   # depth of the pages currently in tocrawl
    while tocrawl and depth_counter <= max_depth:
        page = tocrawl.pop()
        if page not in crawled:
            union(new_depth, get_all_links(get_page(page)))
            crawled.append(page)
        if not tocrawl:
            # Finished this level: move on to the links found one level deeper.
            tocrawl, new_depth = new_depth, []
            depth_counter += 1
    return crawled
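
# An alternative way to track depth (my own sketch, not the graded
# solution above): carry a (url, depth) pair through a single to-crawl
# queue instead of swapping in a whole new level at a time. Processing
# the queue in breadth-first order (pop from the front) matters here: it
# guarantees each page is first reached along a shortest path, so a page
# within the depth limit is never cut off just because a longer path
# happened to find it first. The crawled set should match crawl_web's,
# though the order may differ.
def crawl_web_pairs(seed, max_depth):
    tocrawl = [(seed, 0)]
    crawled = []
    while tocrawl:
        page, depth = tocrawl.pop(0)  # pop(0) gives FIFO / breadth-first order
        if page not in crawled:
            crawled.append(page)
            if depth < max_depth:
                # Only follow links out of pages strictly shallower than max_depth.
                for link in get_all_links(get_page(page)):
                    tocrawl.append((link, depth + 1))
    return crawled

# print(crawl_web_pairs("A1", 3))
#>>> ['A1', 'B1', 'C1', 'E1', 'D1', 'F1']   (same pages, different order)
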
#print crawl_web("http://www.udacity.com/cs101x/index.html",0)
#>>> ['http://www.udacity.com/cs101x/index.html']
#print crawl_web("http://www.udacity.com/cs101x/index.html",1)
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html']
#print crawl_web("http://www.udacity.com/cs101x/index.html",50)
#>>> ['http://www.udacity.com/cs101x/index.html',
#>>> 'http://www.udacity.com/cs101x/flying.html',
#>>> 'http://www.udacity.com/cs101x/walking.html',
#>>> 'http://www.udacity.com/cs101x/crawling.html',
#>>> 'http://www.udacity.com/cs101x/kicking.html']
#print crawl_web("http://top.contributors/forbiddenvoid.html",2)
#>>> ['http://top.contributors/forbiddenvoid.html',
#>>> 'http://top.contributors/graemeblake.html',
#>>> 'http://top.contributors/angel.html',
#>>> 'http://top.contributors/dreyescat.html',
#>>> 'http://top.contributors/johang.html',
#>>> 'http://top.contributors/charlzz.html']
print crawl_web("A1",3)
#>>> ['A1', 'C1', 'B1', 'E1', 'D1', 'F1']
# (May be in any order)
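
# Because the crawl order is unspecified, a simple way to self-check the
# result (my addition, not part of the original grading) is to compare
# sets of pages rather than lists:
# assert set(crawl_web("A1", 3)) == {'A1', 'B1', 'C1', 'D1', 'E1', 'F1'}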