虎扑步行街晒图模块爬虫
一、目标
爬取并下载步行街晒图模块的所有图片。
虎扑地址
https://bbs.hupu.com/selfie
截图
image.png
image.png
二、代码
1.定义请求函数
设置超时时间为 10 秒;如有需要,可通过 socks 代理发送请求。
/**
 * Send a GET request with a hard 10-second deadline, optionally routed
 * through a local SOCKS proxy.
 *
 * `wait()` is awaited before the request is sent (it is defined elsewhere;
 * presumably a throttle between requests — confirm).
 *
 * @param {string} url - Address to fetch.
 * @param {object} [options={}] - Extra axios request options. They are spread
 *   last, so they override the defaults set here.
 * @param {object} [config={}] - When `config.socks` is truthy, HTTPS traffic
 *   goes through `socks://127.0.0.1:${socksPort}`.
 * @returns {Promise<object>} The axios response.
 * @throws {Error} If the request fails, or if the 10-second deadline elapses.
 */
async function get(url, options = {}, config = {}) {
  const { socks } = config;
  await wait();
  const source = axios.CancelToken.source();

  let timer;
  // Independent deadline: rejects even if `options` replaced the cancel token.
  const deadline = new Promise((_, reject) => {
    timer = setTimeout(() => {
      source.cancel("请求超时.....");
      reject(new Error("请求超时....."));
    }, 10000);
  });

  try {
    return await Promise.race([
      axios.get(url, {
        httpsAgent: socks
          ? new SocksProxyAgent(`socks://127.0.0.1:${socksPort}`)
          : undefined,
        headers: { "user-agent": ua },
        timeout: 10000,
        cancelToken: source.token,
        ...options,
      }),
      deadline,
    ]);
  } finally {
    // Without this the timer stays armed after the request settles, keeping
    // the process alive and firing a pointless cancel later.
    clearTimeout(timer);
  }
}
/**
 * Fetch a URL through `get`, asking for the body as an ArrayBuffer.
 *
 * @param {string} url - Address of the image to download.
 * @returns {Promise<object>} Whatever `get` resolves with for this request.
 */
async function getImage(url) {
  const binaryOptions = { responseType: "arraybuffer" };
  const response = await get(url, binaryOptions);
  return response;
}
2.定义HTML解析函数
注意中文解析:将 decodeEntities 设为 false,避免中文字符被转义成 HTML 实体。
/**
 * Parse an HTML string with cheerio and return the resulting query function.
 *
 * @param {string} html - Markup to parse.
 * @returns {*} The value returned by `cheerio.load`.
 */
function jquery(html) {
  // Entity decoding is switched off, matching the article's note about Chinese text.
  const parserOptions = { decodeEntities: false };
  return cheerio.load(html, parserOptions);
}
3.请求网页,解析网页代码,获取图片地址,下载到本地
// Entry point: walk the selfie board's list pages, open each post, and save
// every image found in it under ./imgs/<md5 of post title>/<md5 of image URL>.jpg.
// Failures are logged and skipped at each level so one bad page, post, or
// image does not stop the crawl.
(async () => {
  try {
    // Only page 1 is visited with the current bounds.
    for (let page = 1; page < 2; page++) {
      try {
        const listResponse = await get(`https://bbs.hupu.com/selfie-${page}`);
        const $ = jquery(listResponse.data);
        const posts = $("html").find(".for-list >li");

        for (let p = 0; p < posts.length; p++) {
          try {
            const anchor = $(posts[p]).find("a");
            const name = anchor.text();
            const link = anchor.attr("href");
            console.log(name, link);
            if (!name || !link) {
              continue;
            }

            const detailResponse = await get(`https://bbs.hupu.com${link}`);
            const $1 = jquery(detailResponse.data);
            const imgs = $1("html").find("p > img");
            console.log("图片长度:", imgs.length);
            if (imgs.length === 0) {
              continue;
            }

            // One folder per post, named by the hash of its title.
            const album = md5(name);
            // mkdir errors (e.g. the folder already exists) are ignored on purpose.
            await execAsync(`cd imgs && mkdir ${album}`).catch(() => {});

            for (let n = 0; n < imgs.length; n++) {
              try {
                const imgLink = $1(imgs[n]).attr("src");
                if (!imgLink) {
                  continue;
                }
                const imageResponse = await getImage(imgLink);
                const target = path.resolve(
                  __dirname,
                  `./imgs/${album}/${md5(imgLink)}.jpg`
                );
                await writeFileAsync(target, imageResponse.data);
                console.log("创建成功");
              } catch (e) {
                console.log(e);
              }
            }
          } catch (e) {
            console.log(e);
          }
        }
      } catch (e) {
        console.log(e);
      }
    }
  } catch (e) {
    console.log(e);
  }
})();
三、运行截图
image.png
四、代码地址
https://github.com/ThreesomeWiki/hupu
发表评论 (审核通过后显示评论):