Python Scrapy scripts to scrape lawyer emails from www.mywsba.org


 


Step 1 (mywsba_org_step1): crawl the directory search-result pages and collect profile-link fragments

import scrapy
import csv
from bs4 import BeautifulSoup
class MySpider(scrapy.Spider):
    """Step 1: crawl the WSBA legal-directory search results (Family law).

    Requests every result page and yields, for each grid row, the slice of
    row HTML (characters 65..101) that contains the profile-link fragment.
    Output is meant to be saved and fed to the step-2 email spider.
    """

    name = 'er'

    allowed_domains = ['www.mywsba.org']

    # Base search URL; paginated copies are appended below.
    # NOTE: the original listed the base URL and '&Page=1' as two adjacent
    # string literals with no comma, which silently concatenated them into
    # one malformed URL — fixed here.
    _base_url = ('https://www.mywsba.org/PersonifyEbusiness/LegalDirectory.aspx'
                 '?ShowSearchResults=TRUE&AreaOfPractice=Family')

    start_urls = [_base_url]
    # Pages 1..158 of the search results (same range as the original loop).
    for _page in range(1, 159):
        start_urls.append(_base_url + '&Page=' + str(_page))

    def parse(self, response):
        """Extract up to 20 result rows from one page and yield each slice.

        Yields dicts of the form ``{'titaltext': <37-char HTML slice>}``.
        """
        # Grab the results container and re-parse it with BeautifulSoup.
        tital = response.xpath('//*[@id="content"]/div[1]/div').extract()
        soup = BeautifulSoup(str(tital), 'lxml')

        slices = []
        # Original indexed rows 0..19 blindly (IndexError on short pages);
        # [:20] keeps the same 20-row cap without crashing.
        for row in soup.find_all('tr', class_='grid-row')[:20]:
            # Characters 65..101 of the row HTML hold the profile link text
            # (equivalent to the original a[0:120] then a[65:102]).
            slices.append(str(row)[65:102])

        print(slices)
        for text in slices:
            yield {'titaltext': text}

        # The original set next_page = "" and then checked `is not None`,
        # which always followed the empty URL (re-requesting the same page
        # forever). All pages are already in start_urls, so no follow is
        # needed here.

Step 2 (mywsba_org_email_scrape): visit each profile URL from link.csv and scrape the email address

import scrapy
import csv
from bs4 import BeautifulSoup
class MySpider(scrapy.Spider):
    """Step 2: visit each lawyer profile page and scrape the email address.

    Reads relative profile links (one per line) from ``link.csv`` — produced
    by the step-1 spider — and yields ``{'email': [...]}`` for each profile.
    """

    name = 'email'

    allowed_domains = ['www.mywsba.org']

    start_urls = [
        'https://www.mywsba.org/PersonifyEbusiness/LegalDirectory/LegalProfile.aspx?Usr_ID=000009148365'
    ]

    _base_url = "https://www.mywsba.org/PersonifyEbusiness/LegalDirectory/"

    # Build the full URL list at class-definition time from link.csv.
    # (Original was missing the commas/paren: open("link.csv"encoding=...)
    with open("link.csv", encoding='utf8') as file:
        for line in file:
            # rstrip('\n') instead of [:-1]: the old slice ate a real
            # character when the last line had no trailing newline.
            start_urls.append(_base_url + line.rstrip('\n'))

    def parse(self, response):
        """Extract the email text from the profile page's email label."""
        email = response.xpath(
            '//*[@id="dnn_ctr2977_DNNWebControlContainer_ctl00_lblEmail"]/a/span/text()'
        ).extract()
        yield {'email': email}

        # The original set next_page = "" and then checked `is not None`,
        # which always followed the empty URL (re-requesting the same page
        # forever). Each profile is a single page, so no follow is needed.

No comments:

Post a Comment