爬取币世界标红快讯内容(移动版)
# Dependencies
import time

from lxml import etree
import pymongo
import requests

# Replace the placeholder host with your own MongoDB address; a MongoDB
# installation is required.
client = pymongo.MongoClient('写你自己的数据库地址', 27017)
mydb = client['mydb']
information = mydb['information']  # collection the news items are stored in

# Month/day/hour stamp: used as the `fm` query parameter and as the first
# six characters of the state file.
currentTime = time.strftime("%m%d%H", time.localtime())
# Full timestamp stored in the 'date' field of every record.
saveTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())

# State file holding "<currentTime> <newest id>"; change to your own path.
STATE_FILE = 'C:/Users/SCZ/PycharmProjects/CommunityCrawl/newest'

# Selects the highlighted ("focus") news sections on the mobile page.
FOCUS_XPATH = '//div[@id="kuaixun_list"]/div/article/section[@class="focus"]'

# Pretend to be a mobile browser.
header = {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}


def get_url(url):
    """Fetch one news page and store its highlighted items in MongoDB.

    The id of the first highlighted item on the page is written to
    STATE_FILE together with ``currentTime``.  Every highlighted item
    whose id is greater than that id minus 20 is inserted into the
    ``information`` collection.

    Args:
        url: Address of the mobile news page to crawl.
    """
    # A timeout keeps the crawler from hanging forever on a stalled server.
    html = requests.get(url, headers=header, timeout=10)
    selector = etree.HTML(html.text)
    infos = selector.xpath(FOCUS_XPATH)
    if not infos:
        # Nothing highlighted on the page: report it instead of crashing
        # on the [0] lookup below.
        print('无新数据产生!')
        return

    # The id lives on the parent element of the first "focus" section.
    saveId = infos[0].xpath('../@id')[0]
    with open(STATE_FILE, 'w') as file:
        file.write(currentTime + ' ' + saveId)

    for info in infos:
        try:
            title = info.xpath('h3[@class="text_title"]/text()')[0].strip()
            content = info.xpath('p[@class="text_show"]/text()')[0].strip()
            infoId = info.xpath('../@id')[0]
            is_recent = int(infoId) > int(saveId) - 20
        except (IndexError, ValueError):
            # Skip items that lack a title/content/id or carry a
            # non-numeric id.
            continue

        # The page's own time heading is not read: records are stamped
        # with saveTime.
        data = {
            'title': title,
            'id': infoId,
            'date': saveTime,
            'content': content,
            'source': 'bishijie'
        }
        print(data)
        if is_recent:
            print('插入了一条新数据!')
            information.insert_one(data)
        else:
            print('无新数据产生!')


def _read_saved_stamp():
    """Return the first six characters of STATE_FILE, or '' if it is missing."""
    try:
        with open(STATE_FILE, 'r') as fs:
            return fs.read()[0:6]
    except FileNotFoundError:
        # First run: no state has been written yet.
        return ''


if __name__ == '__main__':
    if _read_saved_stamp() != currentTime:
        print('时间不一致,宕机使用当前系统时间进行爬取!')
    else:
        print('时间一致, 正常运行!')
    # Either way the page for the current hour is crawled.
    urls = ['http://m.bishijie.com/kuaixun?fm=' + currentTime]
    for url in urls:
        get_url(url)
        time.sleep(2)
主要需要掌握的内容: XPath 语法、Python 文件操作、Python 基础语法
本文内容比较基础,写得不好,请多多指教!大家一起进步!
我的其他关于 Python 的文章