I'm trying to scrape a website (https://ae.indeed.com/?r=us). Part of the code is to retrieve the job descriptions, but they're under div tags with different id' on different pages. My code works for all id' except for this:
<div id="jobDescriptionText" class="jobsearch-jobDescriptionText">
<p>Text</p>
<ul>
<li>Text</li>
</ul>
</div>
I keep getting this error:
Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id=
My code is:
from selenium import webdriver
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import openpyxl
import re
#markers
filename1="markers.txt"
file1=open('C:/Users/USER/Desktop/python/indeed.com/markers.txt')
themarkerfile=file1.readlines()
markerfile=list(marker.rstrip('\n') for marker in themarkerfile)
#Excel file1
wb = openpyxl.Workbook()
sheet = wb.active
#Setting data headers
a1= sheet.cell(row = 1, column = 1)
a1.value = 'Searched Job Title'
b1= sheet.cell(row = 1, column = 2)
b1.value = 'Job Title'
c1= sheet.cell(row = 1, column = 3)
c1.value = 'Company'
d1= sheet.cell(row = 1, column = 4)
d1.value= 'Salary'
e1= sheet.cell(row = 1, column = 5)
e1.value= "Rating"
f1=sheet.cell(row = 1, column = 6)
f1.value = 'Marker'
g1=sheet.cell(row = 1, column = 7)
g1.value='Description'
# Start from the first cell below the headers.
row = 2
column = 2
the_listing = 1 #to keep track of which job listing it is iterating through
info=[]
#getting url
def get_url(position, location):
template = 'https://ae.indeed.com/jobs?q={}&l={}'
url = template.format(position, location)
return url
position = 'receptionist'
location='dubai'
url=get_url(position, location)
print ('URL created:', url)
driver = webdriver.Chrome("C:/Users/USER/Desktop/python/chromedriver_win32/chromedriver.exe")
driver.get(url)
driver.implicitly_wait(10)
def retrieving():
#retrieving basic info
try:
for i in range(14):
postings=driver.find_elements_by_class_name('result')
except:
print('Error in retrieving postings')
counts=0 #to get the right rating and description?
rate=[]
job=1
for job in postings:
try:
result_html = job.get_attribute('innerHTML')
soup = BeautifulSoup(result_html, 'html.parser')
except:
print('Error in retreiving job from postings')
try: #retrieving ratings (not sure if this works but is irrelevant to this post
)
list_html_ratings = soup.find_all('span', attrs={'aria-hidden': True})
list_ratings = [x.text for x in list_html_ratings]
rate.append(list_ratings)
except:
list_ratings = 'None'
rate.append(list_ratings)
sleep(randint(10,15))
while True:
#retreiving description
try:
job.click()
try: #works
driver.implicitly_wait(7)
description0=(driver.find_element_by_id('vjs-content').text)
counts=counts+1
break
except Exception as e:
print(e)
try:#works
#job.click()
driver.implicitly_wait(7)
description0=(driver.find_element_by_id('vjs-desc').text)
counts=counts+1
break
except Exception as e:
print(e)
try:
driver.implicitly_wait(7)
description0=driver.find_element_by_xpath('''//*[@id="jobDescriptionText"]''').getText()
print(description0)
counts=counts+1
break
except Exception as e:
print('jobDescriptionText doesnt work')
try: #trying to work with the iframe
driver.implicitly_wait(7)
if (len(driver.finds_element_by_xpath('''//*[@id="jobDescriptionText"]'''))==0):
# get the number of iframes
iframes = driver.find_elements_by_tag_name("iframe")
# iterate through all iframes to find out which iframe the required element
for iFrameNumber in iframes:
# switching to iframe (based on counter)
driver.switch_to.frame(iFrameNumber+1)
# check if the element present in the iframe
if len(driver.finds_element_by_xpath('''//*[@id="jobDescriptionText"]''')) > 0:
print("found element in iframe :" + str(iFrameNumber+1))
description0=driver.find_element_by_id('jobDescriptionText').getText()
driver.switch_to.default_content()
print (str('.getText() iframe',description0))
counts=counts+1
except Exception as e:
print(e)
print('above error was caused by iframe')
try: #trying to switch windows
the_link=[soup.find_all ('a')]
desc_page=str(the_link.get['href'])
window_before = driver.window_handles[0]
desc_page_1=driver.get(desc_page[0])
window_after = driver.window_handles[1]
driver.switch_to_window(window_after)
driver.implicitly_wait(10)
description0=driver.find_element_by_xpath('''//*[@id="jobDescriptionText"]''').getText()
print(description0)
driver.switch_to_window(window_before)
counts=counts+1
break
except Exception as e:
print('not jobDescriptionText in new window or it doesnt work')
print(e)
except:
print ('Error in retrieving desc')
break
for mark in markerfile: #actually getting the stuff
if mark in description0.split() or mark.upper() in description0.split() or mark.lower() in description0.split():
desc=(description0.replace("\n", ""))
#getting job title
try:
job_title=(driver.find_element_by_id('vjs-jobtitle').text)
except:
job_title='None'
#getting company name
try:
company=(driver.find_element_by_id('vjs-cn').text)
except:
company='None'
#getting salary
try:
salary=driver.find_element_by_xpath("""//*[@id="vjs-jobinfo"]/div[3]/span""").text
except:
salary="None"
#getting company rating
try:
count=counts-1
rating=rate[counts]
except:
rating='None'
info.append(tuple((job_title, company,salary, rating, mark, desc)))
def appending():
#adding the retrieved details from info to the excel file
global row
global column
try:
for job_title, company,salary, rating, mark, desc in info: #adding 'The_' to indicate column
try:
The_job_title= sheet.cell(row, column)
The_job_title.value = job_title
except Exception as e:
print('error in appending title.')
try:
The_company=sheet.cell(row, column+1)
The_company.value= company
except Exception as e:
print('error in appending company.')
try:
The_salary= sheet.cell(row, column+2)
The_salary.value= salary
except Exception as e:
pass
try:
The_rating=sheet.cell(row, column+3)
The_rating.value=rating
except Exception as e:
pass
try:
The_mark=sheet.cell(row, column+4)
The_mark.value=mark
except Exception as e:
print('error in appending mark.')
try:
The_desc=sheet.cell(row, column+5)
The_desc.value=desc
except Exception as e:
print('error in appending desc.')
try:
wb.save("C:\\Users\\user\\Desktop\\python\\indeed.com\\Indeed_data_ONE.xlsx")
except Exception as e:
print('error in saving file.')
try:
row= row+1
except Exception as e:
print("Can't change row")
except:
print('Error in committing data to excel')
#where the actual magic happens
countent=0
page=10 #page 2 has 10 in url
for i in range (9):
if countent==0:
retrieving()
appending()
the_listing=the_listing+1
sleep(randint(10,15))
#going to the next page
next_page='https://ae.indeed.com/jobs?q={}&l={}&start={}'
#for i in range (8):
next_page_url=next_page.format(position, location, page)
page=page+10
driver = webdriver.Chrome("C:/Users/USER/Desktop/python/chromedriver_win32/chromedriver.exe")
driver.get(next_page_url)
driver.implicitly_wait(10)
print('Page number', the_listing ,'accessed')
#closing pop up
try:
close_popup=driver.find_element_by_xpath("""//*[@id="popover-x"]/button""")
close_popup.click()
print('POP UP REMOVAL SUCCESS')
except:
print('POP UP REMOVAL FAILURE OR NO POP UP')
if countent >4:
sleep(randint(10,15))
retrieving()
appending()
the_listing=the_listing+1
if countent >4:
sleep(randint(10,15))
countent=0+1
#driver.quit()
What I have tried:
I have also tried to use .text .getText() and .getattribute() with innerText and innerHtml but that doesn't work too.
It's under an iframe tag and I read somewhere that it can affect it? I've tried to account for it but it's still not working. I've also tried to open the description in another window to see if it can make it easier to scrape but that also doesn't work for some reason.