Scraping and Analyzing Douban Movies with Python (Part 3)

Writing the detail file
# Write the joined comment text to the detail file
# (detail_ is the list of comment strings collected earlier in this series)
with open("json_file/detail.text", 'w', encoding="utf-8") as file:
    detail = "".join(detail_).replace("\n", "")
    file.write(detail)


def get_json(data, driver, f):
    Html = etree.HTML(data)
    # Grab every comment block via XPath
    email_details = Html.xpath('//*[@id="comments"]/div')
    items = []
    # Loop over the blocks and pull the data out of each one
    for email_detail in email_details:
        # Profile link, comment text, and rating class
        item = {
            "href": email_detail.xpath('./div/a/@href'),
            "detail": email_detail.xpath("./div[2]/p/span//text()"),
            "rate": email_detail.xpath("./div[2]/h3/span[2]/span[2]/@class")
        }
        items.append(item)
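Each of the three xpath() calls returns a list (which is why the crawl loop below indexes href[0]), and "rate" comes back as a CSS class string such as "allstar50 rating" rather than a number, so it still needs parsing downstream. To make the extraction pattern concrete, here is a small self-contained sketch that runs the same three XPath expressions against a made-up comment block; the HTML below is only illustrative and is not Douban's real markup:

from lxml import etree

# A simplified, hypothetical comment block, shaped so that the same
# XPath expressions used in get_json() find something to match.
html = etree.HTML("""
<div id="comments">
  <div class="comment-item">
    <div class="avatar"><a href="https://example.com/people/alice/"></a></div>
    <div class="comment">
      <h3>
        <span class="comment-vote"></span>
        <span class="comment-info">
          <a>alice</a>
          <span class="comment-time">2021-07-09</span>
          <span class="allstar50 rating"></span>
        </span>
      </h3>
      <p><span class="short">Great movie.</span></p>
    </div>
  </div>
</div>
""")

for block in html.xpath('//*[@id="comments"]/div'):
    item = {
        "href": block.xpath('./div/a/@href'),                      # ['https://example.com/people/alice/']
        "detail": block.xpath('./div[2]/p/span//text()'),          # ['Great movie.']
        "rate": block.xpath('./div[2]/h3/span[2]/span[2]/@class'), # ['allstar50 rating']
    }
    print(item)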
Looping over the list to crawl each reviewer's location

    # Continuation of get_json(): visit each reviewer's profile page and pull out their location
    for email in items:
        # Skip entries with no profile link
        if not email.get("href"):
            continue
        driver.get(email.get("href")[0])
        # Skip reviewers who have deactivated their account
        if "该用户已经主动注销帐号" in driver.page_source:
            continue
        print(email.get("href"))
        wait = WebDriverWait(driver, 10)
        # Explicit wait: proceed only once the profile block has appeared
        wait.until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="profile"]/div/div[2]'))
        )
        # Extract the location text
        email["address"] = etree.HTML(driver.page_source).xpath('//*[@id="profile"]/div/div[2]/div[1]/div//text()')
        # Append the whole record to the raw output file
        f.write(json.dumps(email, ensure_ascii=False) + ",\n")
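The wait.until(...) call above is Selenium's explicit-wait pattern: the driver keeps polling (every 0.5 s by default) until the profile block exists in the DOM, or raises TimeoutException after 10 seconds. The snippet assumes WebDriverWait, EC and By were imported in an earlier part of this series; as a minimal, self-contained sketch of just that pattern, run against a tiny inline page rather than Douban:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # driver path/options omitted; adjust for your own setup

# A tiny inline page whose structure matches the XPath below,
# just to exercise the explicit wait end to end.
driver.get('data:text/html,<div id="profile"><div><div>one</div><div>two</div></div></div>')

# Poll until the element exists in the DOM, then return it.
element = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="profile"]/div/div[2]'))
)
print(element.text)  # prints "two"

driver.quit()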
Main program

def main(user, pwd):
    """Main program"""
    f = open("data.json", "w", encoding="utf-8")
    f.write("[")
    url = "***ounts.douban.com/passport/login"
    # Path to the browser driver
    driver_file = os.path.join(os.path.dirname(__file__), "driver_exe", "chromedriver.exe")
    # Start the driver
    driver = webdriver.Chrome(executable_path=driver_file)
    driver.get(url)
    # Switch to password login and fill in the credentials
    driver.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[1]/ul[1]/li[2]').click()
    driver.find_element_by_xpath('//*[@id="username"]').send_keys(user)
    driver.find_element_by_xpath('//*[@id="password"]').send_keys(pwd)
    driver.find_element_by_xpath('//*[@id="account"]/div[2]/div[2]/div/div[2]/div[1]/div[4]/a').click()
    # Pause briefly so the slider captcha can appear
    time.sleep(2)
    # Solve the slider captcha (slide() was defined in an earlier part of this series)
    slide(driver)
    print("成功")
    # Crawl the two comment tabs (status=P and status=F); P has more pages to fetch
    type_mile = ["P", "F"]
    for type_ in type_mile:
        if type_ == "P":
            index_mile = 480
        else:
            index_mile = 180
        for i in range(0, index_mile, 20):
            url = f"***/subject/35087699/comments?start={i}&limit=20&status={type_}&sort=new_score"
            print(url)
            driver.get(url)
            # Parse this page of comments
            get_json(driver.page_source, driver, f)
    # Close the JSON array and quit the browser once everything has been fetched,
    # then parse the raw data and build the processed JSON file
    f.write("]")
    f.close()
    driver.quit()
    get_file()


if __name__ == '__main__':
    user = input("请输入你的账号:")
    pwd = input("请输入你的密码:")
    main(user, pwd)
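Because each record is written as json.dumps(...) plus ",\n" between the opening "[" and the closing "]", the raw data.json ends with a trailing comma and cannot be passed to json.loads() as-is; cleaning that up is presumably part of what get_file() (defined in an earlier part of this series) handles before the analysis. Purely as an illustration of the raw file's shape, and not a stand-in for the series' actual get_file(), a minimal loader could look like this:

import json
import re

# Load the raw data.json produced by main(). The file looks like
# "[{...},\n{...},\n]", so strip the trailing comma before parsing.
with open("data.json", encoding="utf-8") as fh:
    raw = fh.read()

records = json.loads(re.sub(r",\s*\]$", "]", raw))

for rec in records[:3]:
    print(rec.get("href"), rec.get("rate"), rec.get("address"))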
For learning purposes only; please crawl responsibly!
I hope you will like, save, and share this post; thank you all for your support!
Wishing everyone smooth progress learning Python!
If you are teaching yourself like me and would like the code from this article or other Python learning materials, you can share this post and send me a private message (message me "中国医生").
