提取的数据不存储在scrapy项目中
我正在使用scrapy和selen在python中构build一个屏幕刮板的项目,然后使用xlsxwriter将其输出到一个excel文件中。 但是,似乎我的scrapy项目总是空着。 我不确定在这一点发生了什么,所以任何帮助表示赞赏。 只是要注意,在这里张贴之前,有几个url,目录和其他一些敏感信息。 所以一些链接和目录可能看起来很奇怪。
excel输出的屏幕截图: 屏幕刮板文件输出
蜘蛛:
import os import time from datetime import date from ScreenScraper.items import * from scrapy import * from scrapy.http import FormRequest from scrapy.loader import ItemLoader from scrapy.selector import Selector from selenium import webdriver from selenium.webdriver.common.keys import Keys from xlsxwriter import * class CquentiaSpider(Spider): name = 'cquentia' allowed_domains = ['linktotable.com'] start_urls = ['www.linktotable.com/login'] #had to strip directories ,urls, usernames, and passwords from script. login_user = 'Example' login_pass = 'Example' accnt_ID_1 = 'Example' phantomjspath = r'C:\Users\[User]\Documents\Visual Studio 2015\Projects\ScreenScraper\Scraper\selenium\webdriver\phantomjs\bin\phantomjs.exe' def __init__(self, name = None, **kwargs): self.browser = webdriver.PhantomJS(executable_path=self.phantomjspath) return super(CquentiaSpider, self).__init__(name, **kwargs) def parse(self, response): self.browser.get(response.url) username = self.browser.find_element_by_name('username') password = self.browser.find_element_by_name('password') login = self.browser.find_element_by_name('submit') username.send_keys(self.login_user) password.send_keys(self.login_pass) login.click() time.sleep(1.5) self.browser.get('www.linktotable.com/search') accnt_id = self.browser.find_element_by_name('accnId') search = self.browser.find_element_by_name('accnSrch') accnt_id.send_keys(self.accnt_ID_1) search.click() time.sleep(1.5) select = Selector(text=self.browser.page_source) get_table_count = count() get_table_count['row_count'] = select.xpath('//*[@id="otTable"]/thead/tr/td[1]/text()').extract() [0] count_final = int(get_table_count['row_count']) + 1 patient = cquentiaPatientItems() patient['Pt_First_Name'] = select.xpath('//*[@id="ptFNm"]/text()').extract() patient['Pt_Last_Name'] = select.xpath('//*[@id="ptLNm"]/text()').extract() patient['Client_ID_Name'] = select.xpath('//*[@id="clnNm"]/text()').extract() patient['DOS'] = select.xpath('//*[@id="dos"]/text()').extract() patient_First_Name = str(patient['Pt_First_Name']) patient_Last_Name = str(patient['Pt_Last_Name']) patient_Client_ID = str(patient['Client_ID_Name']) patient_DOS = str(patient['DOS']) header_layout = ['First Name:', 'Last Name:', 'Client Name:', 'DOS:'] header_data = [patient_First_Name, patient_Last_Name, patient_Client_ID, patient_DOS] table_header_layout = ['Test ID', 'Name', 'Mod 1', 'Mod 2', 'Mod 3', 'Mod 4', 'Proc Code', 'Name','Units Billed $', 'Billed $', 'Gross $', 'Expect $', 'Price Method', 'Payor ID', 'POS', 'Rendering Phys'] workbook = Workbook('%s_spider.xlsx' % (self.name)) worksheet = workbook.add_worksheet() row = 0 col = 0 for value in header_layout: worksheet.write(row, 0, value) row = 1 + row row = 0 for value in header_data: worksheet.write(row, 1, value) row = 1 + row row = 6 for value in table_header_layout: worksheet.write(row, col, value) col = 1 + col col = 0 row = 7 for number in range(0, count_final): table = Table() table['Test_ID'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[2]/text()' % (number)).extract() table['Name_1'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[3]/text()' % (number)).extract() table['Name_2'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[9]/text()' % (number)).extract() table['Mod_1'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[4]/text()' % (number)).extract() table['Mod_2'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[5]/text()' % (number)).extract() table['Mod_3'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[6]/text()' % (number)).extract() table['Mod_4'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[7]/text()' % (number)).extract() table['Proc_Code'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[8]/text()' % (number)).extract() table['Units_Billed'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[10]/text()' % (number)).extract() table['Billed'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[11]/text()' % (number)).extract() table['Gross'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[12]/text()' % (number)).extract() table['Expect'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[13]/text()' % (number)).extract() table['Price_Method'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[15]/text()' % (number)).extract() table['Payor_ID'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[16]/text()' % (number)).extract() table['POS'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[22]/text()' % (number)).extract() table['Rendering_Phys'] = select.xpath('//*[@id="otTable"]/tbody/tr[%s]/td[23]/text()' % (number)).extract() table_Test_ID = str(table['Test_ID']) table_Name_1 = str(table['Name_1']) table_Name_2 = str(table['Name_2']) table_Mod_1 = str(table['Mod_1']) table_Mod_2 = str(table['Mod_2']) table_Mod_3 = str(table['Mod_3']) table_Mod_4 = str(table['Mod_4']) table_Proc_Code = str(table['Proc_Code']) table_Units_Billed = str(table['Units_Billed']) table_Billed = str(table['Billed']) table_Gross = str(table['Gross']) table_Expect = str(table['Expect']) table_Price_Method = str(table['Price_Method']) table_Payor_ID = str(table['Payor_ID']) table_POS = str(table['POS']) table_Rendering_Phys = str(table['Rendering_Phys']) table_data = [table_Test_ID, table_Name_1, table_Mod_1, table_Mod_2, table_Mod_3, table_Mod_4, table_Proc_Code, table_Name_2, table_Units_Billed, table_Billed, table_Gross, table_Expect, table_Price_Method, table_Payor_ID, table_POS, table_Rendering_Phys] for text in table_data: worksheet.write(row, col, text) col = 1 + col row = 1 + row col = 0 workbook.close()
项目:
from scrapy import * class count(Item): row_count = Field() class Table(Item): Test_ID = Field(serializer=str) Name_1 = Field(serializer=str) Name_2 = Field(serializer=str) Mod_1 = Field(serializer=str) Mod_2 = Field(serializer=str) Mod_3 = Field(serializer=str) Mod_4 = Field(serializer=str) Proc_Code = Field(serializer=int) Units_Billed = Field(serializer=str) Billed = Field(serializer=str) Gross = Field(serializer=str) Expect = Field(serializer=str) Price_Method = Field(serializer=str) Payor_ID = Field(serializer=str) POS = Field(serializer=str) Rendering_Phys = Field(serializer=str) class cquentiaPatientItems(Item): Pt_First_Name = Field(serializer=str) Pt_Last_Name = Field(serializer=str) Client_ID_Name = Field(serializer=str) DOS = Field()