虎扑步行街晒图模块爬虫

一、目标

爬取并下载步行街晒图模块的所有图片。

虎扑地址:https://bbs.hupu.com/selfie

截图:image.png image.png

二、代码

1. 定义请求函数

设置超时时间为 10 秒;如有需要,可通过 socks 代理发起请求。

```js
async function get(url, options = {}, config = {}) {
    const { socks } = config;
    await wait();
    const source = axios.CancelToken.source();
    return new Promise((resolve, reject) => {
        setTimeout(() => {
            source.cancel("请求超时.....");
            reject();
        }, 10000);
        axios
            .get(url, {
                httpsAgent: socks
                    ? new SocksProxyAgent(`socks://127.0.0.1:${socksPort}`)
                    : undefined,
                headers: { "user-agent": ua },
                timeout: 10000,
                cancelToken: source.token,
                ...options,
            })
            .then(resolve)
            .catch(reject);
    });
}

async function getImage(url) {
    return await get(url, { responseType: "arraybuffer" });
}
```

2. 定义 HTML 解析函数

注意中文解析:加载 HTML 时需将 decodeEntities 设为 false。

```js
function jquery(html) {
    return cheerio.load(html, { decodeEntities: false });
}
```

3. 请求网页,解析网页代码,获取图片地址,并下载到本地

(注:示例中的循环条件为 i < 2,即只抓取第 1 页;如需抓取更多页,请调大循环上限。)

```js
(async () => {
    try {
        for (let i = 1; i < 2; i++) {
            try {
                const html = (await get(`https://bbs.hupu.com/selfie-${i}`))
                    .data;
                const $ = jquery(html);
                const list = $("html").find(".for-list >li");
                for (let j = 0; j < list.length; j++) {
                    try {
                        const name = $(list[j]).find("a").text();
                        const link = $(list[j]).find("a").attr("href");
                        console.log(name, link);
                        if (name && link) {
                            const detail = (
                                await get(`https://bbs.hupu.com${link}`)
                            ).data;
                            const $1 = jquery(detail);
                            const imgs = $1("html").find("p > img");
                            console.log("图片长度:", imgs.length);
                            if (imgs.length) {
                                await execAsync(
                                    `cd imgs && mkdir ${md5(name)}`
                                ).catch(() => {});
                            }
                            for (let k = 0; k < imgs.length; k++) {
                                try {
                                    const imgLink = $1(imgs[k]).attr("src");
                                    if (imgLink) {
                                        const imgData = (
                                            await getImage(imgLink)
                                        ).data;
                                        await writeFileAsync(
                                            path.resolve(
                                                __dirname,
                                                `./imgs/${md5(name)}/${md5(
                                                    imgLink
                                                )}.jpg`
                                            ),
                                            imgData
                                        );
                                        console.log("创建成功");
                                    }
                                } catch (e) {
                                    console.log(e);
                                }
                            }
                        }
                    } catch (e) {
                        console.log(e);
                    }
                }
            } catch (e) {
                console.log(e);
            }
        }
    } catch (e) {
        console.log(e);
    }
})();
```

三、运行截图

image.png

四、代码地址

https://github.com/ThreesomeWiki/hupu

本文章由javascript技术分享原创和收集

发表评论 (审核通过后显示评论):