# 1. 浏览器伪装技术
# 2. 爬取的链接内容写入文件中
import os
import re
import urllib.request
# Fetch the page at `url` while sending a desktop-browser User-Agent header,
# collect every link captured by `pat`, then download each link and save the
# raw response bytes as blog.csdn/<index>.html.
url = 'http://blog.csdn.net'
headers = ('User-Agent','Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36')

# A single opener carrying the custom header is reused for every request;
# building a new, identical opener per link is unnecessary work.
opener = urllib.request.build_opener()
opener.addheaders = [headers]
with opener.open(url) as response:
    data = response.read()

# NOTE(review): the "https://ask.hellobi.com/" prefix in front of the capture
# group looks like a link-rewrite artifact from wherever this snippet was
# hosted -- confirm against the target page's real markup before relying on it.
pat='<h3 class="tracking-ad" data-mod="popu_254"><a href="https://ask.hellobi.com/(http://blog.csdn.net/.+?)"'

# Decode the body instead of calling str(data): str() on a bytes object returns
# its "b'...'" repr (escaped newlines and quotes), not the page text.
# Assumes the page is UTF-8; undecodable bytes are dropped -- TODO confirm.
allurl = re.compile(pat).findall(data.decode('utf-8', errors='ignore'))
num = len(allurl)

# open() does not create missing directories; without this every write below
# fails when blog.csdn/ does not exist yet.
os.makedirs('blog.csdn', exist_ok=True)

for i, thisurl in enumerate(allurl):
    filename = 'blog.csdn/' + str(i) + '.html'
    print(filename, thisurl)
    try:
        with opener.open(thisurl) as page:
            datafile = page.read()
        # The file is opened only after a successful download, so a failed
        # request leaves no empty file behind; `with` closes it even if the
        # write raises.
        with open(filename, 'wb') as fh:
            fh.write(datafile)
    except Exception as err:
        # Best effort: report the failure for this link and continue with the
        # remaining ones.
        print(err)