I'm trying to scrape a website (https://ae.indeed.com/?r=us). Part of the code retrieves the job descriptions, but they sit under div tags with different ids on different pages. My code works for every id except this one:

<div id="jobDescriptionText" class="jobsearch-jobDescriptionText">
 <p>Text</p>
 <ul>
  <li>Text</li>
 </ul>
</div>



I keep getting this error:
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id=
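What I am really after is one lookup that tries each id I have seen so far and falls back to the class name from the snippet above. Here is a minimal sketch of that idea (find_description is just my own helper name, and the candidate list is only what I have observed so far, not a complete set):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

#locators observed so far; the exact id varies from page to page
CANDIDATES = [
    (By.ID, 'jobDescriptionText'),
    (By.ID, 'vjs-content'),
    (By.ID, 'vjs-desc'),
    (By.CLASS_NAME, 'jobsearch-jobDescriptionText'),
]

def find_description(driver, timeout=7):
    #return the text of the first candidate that appears, or None
    for by, value in CANDIDATES:
        try:
            element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((by, value)))
            return element.text
        except Exception:
            continue
    return None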

My code is:

from selenium import webdriver
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import openpyxl

#markers: keywords to search for in each job description
with open('C:/Users/USER/Desktop/python/indeed.com/markers.txt') as file1:
    markerfile = [marker.rstrip('\n') for marker in file1]

#Excel file
wb = openpyxl.Workbook()
sheet = wb.active
#Setting data headers
headers = ['Searched Job Title', 'Job Title', 'Company', 'Salary', 'Rating', 'Marker', 'Description']
for col, header in enumerate(headers, start=1):
    sheet.cell(row=1, column=col).value = header

# Start from the first cell below the headers.
row = 2
column = 2
the_listing = 1  #to keep track of which results page it is on

info = []

#getting the search url
def get_url(position, location):
    template = 'https://ae.indeed.com/jobs?q={}&l={}'
    return template.format(position, location)


position = 'receptionist'
location = 'dubai'
url = get_url(position, location)
print('URL created:', url)


driver = webdriver.Chrome("C:/Users/USER/Desktop/python/chromedriver_win32/chromedriver.exe")
driver.get(url)
driver.implicitly_wait(10)


def retrieving():
    #retrieving basic info for every posting on the current page
    try:
        postings = driver.find_elements_by_class_name('result')
    except Exception:
        postings = []
        print('Error in retrieving postings')

    counts = 0  #to line each description up with its rating
    rate = []

    for job in postings:

        try:
            result_html = job.get_attribute('innerHTML')
            soup = BeautifulSoup(result_html, 'html.parser')
        except Exception:
            print('Error in retrieving job from postings')

        try:  #retrieving ratings (not sure if this works, but it is irrelevant to this post)
            list_html_ratings = soup.find_all('span', attrs={'aria-hidden': True})
            list_ratings = [x.text for x in list_html_ratings]
            rate.append(list_ratings)
        except Exception:
            rate.append('None')

        sleep(randint(10, 15))

        description0 = ''  #so the marker loop below never sees an undefined name
        while True:
            #retrieving the description, trying each known element id in turn
            try:
                job.click()

                try:  #works
                    driver.implicitly_wait(7)
                    description0 = driver.find_element_by_id('vjs-content').text
                    counts = counts + 1
                    break
                except Exception as e:
                    print(e)

                try:  #works
                    driver.implicitly_wait(7)
                    description0 = driver.find_element_by_id('vjs-desc').text
                    counts = counts + 1
                    break
                except Exception as e:
                    print(e)

                try:
                    driver.implicitly_wait(7)
                    description0 = driver.find_element_by_xpath('//*[@id="jobDescriptionText"]').text
                    print(description0)
                    counts = counts + 1
                    break
                except Exception:
                    print('jobDescriptionText doesnt work')

                try:  #trying to work with the iframe
                    driver.implicitly_wait(7)
                    found_in_iframe = False
                    if len(driver.find_elements_by_xpath('//*[@id="jobDescriptionText"]')) == 0:
                        #iterate through all iframes to find out which one holds the element
                        for number, iframe in enumerate(driver.find_elements_by_tag_name('iframe')):
                            driver.switch_to.frame(iframe)
                            if len(driver.find_elements_by_xpath('//*[@id="jobDescriptionText"]')) > 0:
                                print('found element in iframe: ' + str(number))
                                description0 = driver.find_element_by_id('jobDescriptionText').text
                                counts = counts + 1
                                found_in_iframe = True
                            driver.switch_to.default_content()
                            if found_in_iframe:
                                break
                    if found_in_iframe:
                        break
                except Exception as e:
                    print(e)
                    print('above error was caused by iframe')

                try:  #trying to open the description in a new window
                    links = soup.find_all('a')
                    desc_page = links[0].get('href')  #assumes the first link in the card is the posting
                    window_before = driver.window_handles[0]
                    #driver.get() reuses the current window, so open a new one via script
                    driver.execute_script('window.open(arguments[0])', desc_page)
                    window_after = driver.window_handles[1]
                    driver.switch_to.window(window_after)
                    driver.implicitly_wait(10)
                    description0 = driver.find_element_by_xpath('//*[@id="jobDescriptionText"]').text
                    print(description0)
                    driver.close()
                    driver.switch_to.window(window_before)
                    counts = counts + 1
                    break
                except Exception as e:
                    print('no jobDescriptionText in new window, or it doesnt work')
                    print(e)
            except Exception:
                print('Error in retrieving desc')
                break

      
        for mark in markerfile:  #actually getting the stuff
            if (mark in description0.split()
                    or mark.upper() in description0.split()
                    or mark.lower() in description0.split()):
                desc = description0.replace('\n', '')

                #getting job title
                try:
                    job_title = driver.find_element_by_id('vjs-jobtitle').text
                except Exception:
                    job_title = 'None'

                #getting company name
                try:
                    company = driver.find_element_by_id('vjs-cn').text
                except Exception:
                    company = 'None'

                #getting salary
                try:
                    salary = driver.find_element_by_xpath('//*[@id="vjs-jobinfo"]/div[3]/span').text
                except Exception:
                    salary = 'None'

                #getting company rating (counts was already incremented, so step back one)
                try:
                    rating = rate[counts - 1]
                except Exception:
                    rating = 'None'

                info.append((job_title, company, salary, rating, mark, desc))


def appending():
    #adding the retrieved details from info to the excel file
    #note: info keeps growing across pages, so each call re-writes earlier rows too
    global row
    try:
        for job_title, company, salary, rating, mark, desc in info:
            #one column per field, starting at the 'Job Title' column
            for offset, value in enumerate((job_title, company, salary, rating, mark, desc)):
                try:
                    sheet.cell(row, column + offset).value = value
                except Exception:
                    print('error in appending column', column + offset)

            try:
                wb.save("C:\\Users\\user\\Desktop\\python\\indeed.com\\Indeed_data_ONE.xlsx")
            except Exception:
                print('error in saving file.')

            row = row + 1
    except Exception:
        print('Error in committing data to excel')

#where the actual magic happens
countent = 0
page = 10  #page 2 has start=10 in its url
for i in range(9):

    if countent == 0:
        retrieving()
        appending()
        the_listing = the_listing + 1
        sleep(randint(10, 15))

    #going to the next page, reusing the same driver instead of launching a new Chrome each time
    next_page = 'https://ae.indeed.com/jobs?q={}&l={}&start={}'
    next_page_url = next_page.format(position, location, page)
    page = page + 10

    driver.get(next_page_url)
    driver.implicitly_wait(10)

    print('Page number', the_listing, 'accessed')

    #closing pop up
    try:
        close_popup = driver.find_element_by_xpath('//*[@id="popover-x"]/button')
        close_popup.click()
        print('POP UP REMOVAL SUCCESS')
    except Exception:
        print('POP UP REMOVAL FAILURE OR NO POP UP')

    if countent > 4:
        sleep(randint(10, 15))

    retrieving()
    appending()
    the_listing = the_listing + 1

    if countent > 4:
        sleep(randint(10, 15))

    countent = countent + 1


#driver.quit()

What I have tried:

I have also tried .text, .getText(), and .get_attribute() with 'innerText' and 'innerHTML', but none of those work either.
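For what I mean by that, here is a minimal snippet (it assumes the element was actually found first; in the Python bindings .getText() does not exist at all, only .text and .get_attribute()):

element = driver.find_element_by_id('jobDescriptionText')

text1 = element.text                        #rendered, visible text only
text2 = element.get_attribute('innerText')  #innerText straight from the DOM
html1 = element.get_attribute('innerHTML')  #inner markup as a string
#element.getText() raises AttributeError - that method exists in the Java bindings, not Python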

The description is under an iframe tag, and I read somewhere that this can affect element lookup. I've tried to account for it, but it's still not working. I've also tried opening the description in another window to see if that would make it easier to scrape, but that doesn't work either, for some reason.
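In case it helps, this is the iframe handling I was attempting, pulled out into a standalone sketch (find_in_iframes is my own helper name; it assumes the description iframe, if there is one, sits directly under the top-level document rather than nested):

from selenium.webdriver.common.by import By

def find_in_iframes(driver, locator=(By.ID, 'jobDescriptionText')):
    #check the top-level document first
    matches = driver.find_elements(*locator)
    if matches:
        return matches[0].text
    #then check inside each iframe, switching back out after every attempt
    for iframe in driver.find_elements(By.TAG_NAME, 'iframe'):
        driver.switch_to.frame(iframe)
        matches = driver.find_elements(*locator)
        text = matches[0].text if matches else None
        driver.switch_to.default_content()
        if text is not None:
            return text
    return None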