爬虫编码问题

0

import requests import re from requests.exceptions import RequestException import json def get_one_page(url): try: response=requests.get(url) if response.status_code==200: return response.text return None except RequestException: return None def parse_one_url(html): pattern=re.compile('<dd>.*?board-index.*?">(\d+)</i>.*?data-src="(.*?)".*?name"><a' +'.*?>(.*?)</a></p>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>' +'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>', re.S) items=re.findall(pattern, html) #print(items) for item in items: yield { 'index': item[0], 'imgurl': item[1], 'title': item[2], 'actor': item[3].strip()[3:], 'releasetime': item[4].strip()[5:], 'point': item[5]+item[6] } def write_to_file(content): with open('result.txt', 'a', encoding='utf-8') as f: f.write(json.dumps(content, ensure_ascii=False) + '\n') f.close() def main(offset): url = "http://maoyan.com/board/4?offset="+str(offset) html = get_one_page(url) for item in parse_one_url(html): print(item) write_to_file(item) if __name__ == '__main__': for i in range(10): main(i*10)
 
Traceback (most recent call last):
  File "D:/天善学习/Python数据分析与实战/python爬虫/maoyantop100/spider.py", line 43, in <module>
    main(i*10)
  File "D:/天善学习/Python数据分析与实战/python爬虫/maoyantop100/spider.py", line 38, in main
    print(item)
UnicodeEncodeError: 'gbk' codec can't encode character '\xf4' in position 135: illegal multibyte sequence
 
 
出现问题问题
已邀请:
0

邓旭东HIT - 爱编程的营销小硕 2017-03-18 回答

代码版面太乱了,能不能修改下

要回复问题请先登录注册