复习:Python爬虫介绍
需求:爬取三国演义小说所有的章节标题和章节内容
1
| http://www.shicimingju.com/book/sanguoyanyi.html
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37
|
"""Scrape all chapter titles and contents of the novel "Romance of the
Three Kingdoms" (三国演义) from shicimingju.com into ./sanguo.txt."""
import requests
from bs4 import BeautifulSoup

if __name__ == "__main__":
    # Present a real browser UA so the site does not reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36'
    }
    url = 'http://www.shicimingju.com/book/sanguoyanyi.html'

    # Fetch the table-of-contents page; fail loudly on HTTP errors
    # (fix: original code silently continued on non-200 responses).
    toc_response = requests.get(url=url, headers=headers)
    toc_response.raise_for_status()
    # Force UTF-8: the site's declared charset can be missing/wrong.
    toc_response.encoding = 'utf-8'

    soup = BeautifulSoup(toc_response.text, 'lxml')
    # Each <li> under .book-mulu holds one chapter link.
    li_list = soup.select('.book-mulu > ul > li')

    # fix: the original opened the file without ever closing it (leak);
    # the with-block guarantees the handle is flushed and closed.
    with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            detail_url = 'http://www.shicimingju.com' + li.a['href']

            detail_response = requests.get(url=detail_url, headers=headers)
            detail_response.raise_for_status()
            detail_response.encoding = 'utf-8'

            detail_soup = BeautifulSoup(detail_response.text, 'lxml')
            div_tag = detail_soup.find('div', class_='chapter_content')
            # fix: guard against a missing content div so one bad page
            # does not abort the whole crawl with AttributeError.
            if div_tag is None:
                print(title, 'skipped: chapter content not found')
                continue

            content = div_tag.text
            fp.write('《' + title + '》\n' + content + '\n')
            print(title, '爬取成功!!!')
|
Tips:
Please indicate the source and original author when reprinting or quoting this article.