python小试牛刀爬取简书内容
大家好我是图恩,网站前期运行期间有写过一个爬虫去简书爬取前端相关文章,然后每天定时爬取一定数量的文章发布到网站,目的就是为了充实网站内容。
但是前期由于技术不过关导致爬取的内容没有分段,整篇文章就是一段,阅读十分不方便,但是等图恩发现这个问题的时候网站已经运行了差不多一年了,所以之前爬取的内容也就没做处理,但是对爬虫做了改进,改进后的爬虫可以爬取到具有样式的内容了,以下为参考代码。
# -*- coding: UTF-8 -*-
"""Crawl articles from a Jianshu collection and store them in MySQL."""
import datetime
import importlib
import json
import sys
import time
import urllib

import requests
from bs4 import BeautifulSoup
import mysql.connector

# NOTE(review): the Python 2-only lines ``reload(sys)`` and
# ``sys.setdefaultencoding('utf-8')`` were removed.  On Python 3 ``reload``
# is not a builtin (NameError) and ``sys.setdefaultencoding`` no longer
# exists (AttributeError); Python 3 strings are Unicode by default, so the
# hack is unnecessary.
# Jianshu collection page whose paginated article-list endpoint is polled.
url = 'https://www.jianshu.com/c/f489ec955505'
# Build the request headers: mimic a desktop Chrome browser and ask the
# server for a JSON response.
headers = {
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-origin',
'accept': 'application/json',
'accept-encoding': 'gzip, deflate, br',
'accept-language': 'zh-CN,zh;q=0.9',
}
# Query parameters: order articles by the time they were added to the
# collection.  NOTE: `page` is mutated in place by getList() while paging.
params = {
'order_by': 'added_at',
'page': 1
}
# Accumulates the raw note dicts fetched from each listing page; consumed
# later by insertPage().
allList = []
def getList(pages=1):
    """Fetch `pages` listing pages from the collection and extend `allList`.

    Each response is expected to be JSON with a ``notes`` list; those note
    dicts are appended to the module-level ``allList``.

    Parameters:
        pages: number of listing pages to fetch (default 1, matching the
            original single-iteration loop).

    Raises:
        requests.HTTPError: if the server returns a non-2xx status.
    """
    for _ in range(pages):
        time.sleep(1)  # be polite: throttle requests to one per second
        resp = requests.get(url, params=params, headers=headers, timeout=10)
        resp.raise_for_status()
        payload = json.loads(resp.text)
        allList.extend(payload['notes'])
        # Advance to the next page AFTER the request.  The original
        # incremented first, so it started at page 2 and never fetched
        # page 1.
        params['page'] += 1
def insertPage():
    """Download each article collected in `allList` and insert new ones into MySQL.

    For every note the article page is fetched, the title (<h1>) and the
    styled HTML body (<article>) are extracted, and the pair is inserted
    into the ``pages`` table unless a row with the same title already
    exists.  Connects to a local MySQL server; commits once at the end.
    """
    # Keep only the fields needed from the listing data.  (The original
    # bound this to `list`, shadowing the builtin.)
    notes = [{'slug': item['slug'], 'title': item['title']} for item in allList]

    item_list = []
    for note in notes:
        time.sleep(1)  # throttle article downloads
        html = requests.get('https://www.jianshu.com/p/' + note['slug'],
                            headers=headers, timeout=10)
        soup = BeautifulSoup(html.text, 'html.parser')
        h1 = soup.find(name='h1')
        artitleTemp = soup.find(name='article')
        print("content---------", artitleTemp)
        # Guard <h1> as well: the original called .text unconditionally and
        # would crash with AttributeError on a page without an <h1>.
        if h1 is not None and artitleTemp is not None:
            # Convert the bs4 Tag to str before storing, otherwise the DB
            # driver rejects the value (type mismatch).
            item_list.append({'title': h1.text, 'content': str(artitleTemp)})

    print("开始连接数据库")
    conn = mysql.connector.connect(host='localhost', user='root',
                                   password='troot', database='test',
                                   buffered=True)
    cursor = conn.cursor()
    print("数据库连接成功")
    try:
        for row in item_list:
            title = row['title']
            sqlcontent = row['content']
            print(title)
            print('----------------------即将写入数据库文章内容----------')
            time.sleep(1)
            # Parameterized query instead of "%s" string formatting: the
            # original broke (and was injectable) whenever a title contained
            # a quote or a % sign.
            cursor.execute("select * from pages where title like %s", (title,))
            if cursor.fetchone() is None and title and sqlcontent:
                cursor.execute(
                    "insert into pages(title,content) values(%s,%s)",
                    (title, sqlcontent))
        # Commit once after the loop.  The original called cursor.close()
        # inside the loop, which would raise on the second iteration; it
        # also ran a dead duplicate SELECT and computed unused date/
        # timestamp values, all removed here.
        conn.commit()
    finally:
        cursor.close()
        conn.close()
# Script entry point.  The original gated this on a constant
# (`a = 1; if (a > 0):`) that was always true; a standard __main__ guard
# keeps the identical run-as-script behavior while letting the module be
# imported without kicking off the crawl.
if __name__ == '__main__':
    time.sleep(1)
    print('---------start task-----------------')
    getList()
    insertPage()
特别注意的是获取内容后如果要写入数据库一定要先用str方法转字符串才能正常写入,否则会报错。
发表评论 (审核通过后显示评论):