mywsba_org_step1
import scrapy
from bs4 import BeautifulSoup


class MySpider(scrapy.Spider):
    name = 'er'

    # Enumerate every search-result page for the Family practice area
    # (pages 1 through 158, as in the original loop).
    start_urls = [
        'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory.aspx?ShowSearchResults=TRUE&AreaOfPractice=Family',
    ]
    for page in range(1, 159):
        start_urls.append(
            'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory.aspx'
            '?ShowSearchResults=TRUE&AreaOfPractice=Family&Page=' + str(page)
        )

    def parse(self, response):
        # Grab the results grid and hand it to BeautifulSoup for row-level parsing.
        grid = response.xpath('//*[@id="content"]/div[1]/div').extract()
        soup = BeautifulSoup(str(grid), 'lxml')

        links = []
        # Each result page lists 20 attorneys in <tr class="grid-row"> rows.
        for row in soup.find_all('tr', class_='grid-row')[:20]:
            row = str(row)
            # Slice the relative profile link (LegalProfile.aspx?Usr_ID=...)
            # out of the raw row markup.
            row = row[0:120]
            row = row[65:102]
            links.append(row)

        print(links)
        for link in links:
            yield {'titaltext': link}

        # All result pages are already listed in start_urls,
        # so there is no "next page" to follow here.
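The fixed character offsets above (row[0:120], then [65:102]) only hold while the row markup keeps exactly the same length. A more robust variant, shown here purely as a hedged alternative sketch and not as part of the original script, reads the href of the row's first anchor tag instead; row_html below is a hypothetical example row using the Usr_ID from step 2:

# Alternative sketch (assumption, not the original approach): take the profile
# link from the row's anchor tag rather than from fixed string offsets.
from bs4 import BeautifulSoup

row_html = '<tr class="grid-row"><td><a href="LegalProfile.aspx?Usr_ID=000009148365">Example Name</a></td></tr>'
row = BeautifulSoup(row_html, 'lxml').find('tr', class_='grid-row')
print(row.find('a')['href'])  # LegalProfile.aspx?Usr_ID=000009148365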
mywsbaorgstepemailscarp_step2
import scrapy


class MySpider(scrapy.Spider):
    name = 'email'
    allowed_domains = ['www.mywsba.org']

    start_urls = [
        'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory/LegalProfile.aspx?Usr_ID=000009148365'
    ]
    # link.csv holds the relative profile links collected in step 1;
    # prepend the directory base URL to turn each one into a full profile URL.
    base = 'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory/'
    with open('link.csv', encoding='utf8') as file:
        for line in file:
            start_urls.append(base + line.strip())

    def parse(self, response):
        # The e-mail address sits in the lblEmail control on each profile page.
        email = response.xpath(
            '//*[@id="dnn_ctr2977_DNNWebControlContainer_ctl00_lblEmail"]/a/span/text()'
        ).extract()
        yield {'email': email}
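One practical note on wiring the two steps together: step 1 is expected to write its 'titaltext' values to link.csv, for example via Scrapy's feed export (scrapy runspider mywsba_org_step1.py -o link.csv, assuming that filename). A CSV produced that way starts with a 'titaltext' header row, which the plain line-by-line read in step 2 would turn into one broken URL. A hedged sketch of reading the file with the csv module instead:

# Hedged sketch: skip the header row that Scrapy's CSV feed exporter writes and
# build the absolute profile URLs from the 'titaltext' column.
import csv

base = 'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory/'
with open('link.csv', encoding='utf8', newline='') as f:
    urls = [base + row['titaltext'].strip() for row in csv.DictReader(f)]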