Python crawler

Exercise 1

http://www.heibanke.com/lesson/crawler_ex00/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
import urllib.request
import re

# Exercise crawler_ex00: every page embeds a five-digit number; appending that
# number to the base URL yields the next page.  Keep following the chain until
# a page no longer contains such a number, then print that page.
url = "http://www.heibanke.com/lesson/crawler_ex00/"
# Use the response as a context manager so the connection is always closed.
with urllib.request.urlopen(url) as response:
    data = response.read().decode('UTF-8')
# NOTE(review): the first page is matched without "是", unlike the pages in
# the loop — presumably the landing page words its hint differently; confirm.
index = re.findall(r'数字([\d]{5})', data)

while index:
    # Only the first match on the page is used to build the next URL.
    url = "http://www.heibanke.com/lesson/crawler_ex00/%s/" % index[0]
    with urllib.request.urlopen(url) as response:
        data = response.read().decode('UTF-8')
    # An empty match list means there is no next number and ends the loop.
    index = re.findall(r'数字是([\d]{5})', data)

# `data` now holds the last page fetched (the one without a further number).
print(data)

Exercise 2

http://www.heibanke.com/lesson/crawler_ex01/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import requests
import re

# Exercise crawler_ex01: guess the numeric password for user "admin" by
# POSTing 0, 1, 2, ... until the response no longer contains the
# "密码错误" (wrong password) marker.
url = "http://www.heibanke.com/lesson/crawler_ex01/"
index = 0
data = {'username': 'admin'}
sol = 1  # truthy start value so the loop body runs at least once

while sol:
    data['password'] = index
    html = requests.post(url, data).text
    # Non-empty list (truthy) while the page still reports a wrong password.
    sol = re.findall(r'密码错误', html)
    index += 1
    print(index)

# `index` was already advanced past the guess that was just accepted,
# hence the "- 1" when reporting it.
print(html, index - 1)

Exercise 3

http://www.heibanke.com/lesson/crawler_ex02/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import requests
import re
# Exercise crawler_ex02: guess the numeric password for user "admin" on a page
# that sits behind a login form and expects a CSRF token in every POST.
url1 = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex02/'
url2 = 'http://www.heibanke.com/lesson/crawler_ex02/'

# A session carries cookies (including csrftoken) from one request to the next.
temp = requests.session()
temp.get(url1)
token = temp.cookies['csrftoken']

index = 0
data = {'username': 'admin', 'password': 123456, 'csrfmiddlewaretoken': token}
temp.post(url1, data)  # submit the login form
sol = 1  # truthy start value so the loop body runs at least once

# Load the exercise page and rebuild the payload with the session's current
# token; the 0 password is a placeholder that is overwritten before any POST.
temp.get(url2)
token = temp.cookies['csrftoken']
data = {'username': 'admin', 'password': 0, 'csrfmiddlewaretoken': token}

while sol:
    # `index` is incremented before use, so guesses start at 1 (0 is never tried).
    index += 1
    data['password'] = index
    html = temp.post(url2, data).text
    # Non-empty list (truthy) while the page still reports a wrong password.
    sol = re.findall(r'密码错误', html)
    print(index)

# `index` is the guess whose response no longer contained the error marker.
print(index, html)

Exercise 4

http://www.heibanke.com/lesson/crawler_ex03/

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import requests
import re
from threading import Thread
import time

# Endpoints used for exercise crawler_ex03: the login form, the exercise page
# itself, and its password-list page.
url1 = 'http://www.heibanke.com/accounts/login/?next=/lesson/crawler_ex03/'
url2 = 'http://www.heibanke.com/lesson/crawler_ex03/'
url3 = 'http://www.heibanke.com/lesson/crawler_ex03/pw_list/'

password = {}

# Request the login page first so the session holds a csrftoken cookie, then
# submit the credentials together with that token.
temp = requests.session()
temp.get(url1)
token = temp.cookies['csrftoken']
data = dict(username='admin', password='123456', csrfmiddlewaretoken=token)
temp.post(url1, data)

# Visit the exercise page and store the session's current token in the payload.
temp.get(url2)
token = temp.cookies['csrftoken']
data.update(csrfmiddlewaretoken=token)

def loop(passwd):
    """Fetch the password-list page once and record the digits it shows.

    The page is scanned for ``password_pos`` / ``password_val`` table cells;
    each value is stored in ``passwd`` at index ``position - 1``.

    Args:
        passwd: Mutable sequence of password characters, updated in place.
            Slots that have not been seen yet are expected to be falsy
            (the caller uses ``''``).
    """
    html = temp.get(url3)
    pos = re.findall(r'password_pos">([0-9]*)</td>', html.text)
    val = re.findall(r'password_val">([0-9]*)</td>', html.text)
    # Pair each position with its value; positions on the page are 1-based.
    for position, value in zip(pos, val):
        passwd[int(position) - 1] = value
    print(passwd)
    # Progress: number of slots filled so far (the sequence length itself
    # never changes, so printing len() would always show the same number).
    print(sum(1 for ch in passwd if ch))

# One slot per password character; '' marks a position not yet discovered.
passwd = [''] * 100
T = ['t1', 't2']  # names of the worker threads launched in each round

while '' in passwd:
    workers = []
    for name in T:
        # Pass the callable and its argument separately: writing
        # target=loop(passwd) would run loop() right here in the main thread
        # and hand Thread a target of None.
        t = Thread(target=loop, args=(passwd,), name=name)
        t.start()
        workers.append(t)
        # Pause between launches, kept from the original script.
        # NOTE(review): presumably to avoid hammering the server — confirm.
        time.sleep(8)
    # Wait for every worker of this round, not just the last one started.
    for t in workers:
        t.join()

passwd = ''.join(passwd)
print(passwd)

GitHub

Source code: https://github.com/ChrisX2016/Web_crawler/tree/master/chuangguan