"""Example driver: crawl a single web page with the project's WebCrawler.

Per the path comments below, one output file receives the raw HTML source
and the other receives the processed text.
"""
import logging

from utils.WebCrawler import WebCrawler

logging.basicConfig(level=logging.INFO)


if __name__ == '__main__':
    # Output locations: one for the raw HTML source, one for the processed
    # text.
    # NOTE(review): the file names mention a CSDN article id while base_url
    # below points at langchain.asia -- confirm the names are intentional.
    html_dir = "data/html/tmp_csdn_122513786_html.jsonl"
    text_dir = "data/text/tmp_csdn_122513786_text.jsonl"

    # Page to download. Other candidate URLs kept for reference:
    #   https://www.langchain.asia/
    #   https://blog.csdn.net/weixin_43791511/article/details/122513786
    #   https://zhuanlan.zhihu.com/p/645400277
    #   https://www.aliyun.com/?utm_content=se_1014243503
    #   https://cloud.tencent.com/developer/article/1004500?from=15425
    base_url = 'https://www.langchain.asia/'

    # Crawl backend: either "requests" or "selenium".
    #   - requests: a plain request for the static HTML of the URL; data
    #     rendered dynamically by JavaScript cannot be retrieved.
    #   - selenium: simulates a human visitor and can retrieve the full HTML,
    #     but is slow (10-20 s per page); prefer a time_sleep above 5 s.
    reptile_lib = "requests"
    method = "get"  # only GET requests are supported for now
    time_sleep = 4  # interval between two consecutive requests, in seconds

    wc = WebCrawler()

    # Crawl only the single page at base_url.
    wc.webcrawler_single(
        html_dir=html_dir,
        text_dir=text_dir,
        base_url=base_url,
        reptile_lib=reptile_lib,
        method=method,
        time_sleep=time_sleep,
    )

    # Alternative: crawl every URL found on the base_url page, restricted to
    # URLs starting with target_url_prefix (defaults to base_url):
    #
    # wc.webcrawler_1_degree(
    #     html_dir=html_dir,
    #     text_dir=text_dir,
    #     base_url=base_url,
    #     reptile_lib=reptile_lib,
    #     method=method,
    #     time_sleep=time_sleep,
    # )