The documentation I read suggested that urlopen can be slow, but I doubt rewriting this program in a threaded fashion would help.
Also, if you see anything that seems off in the programming, let me know; I'm self-taught, and my work doesn't get critiqued very often.
#This contains the functions for scraping the jobs section of the Moffitt website
from urllib import *
from re import *
def getHTML(url):
    """Fetch ``url`` and return the body of the response.

    The response object is closed even when reading the body raises, so a
    failed download does not leave the connection open.
    """
    # Imported locally so this function does not rely on the file's import
    # header being changed.
    from contextlib import closing

    # closing() only needs a close() method, so it works whether or not the
    # object returned by urlopen() is itself a context manager.
    with closing(urlopen(url)) as response:
        return response.read()
def getMoffittURLs(self):
    """Extract the individual job URLs from the main Moffitt results page.

    ``self`` is the HTML source text of the results page (despite the name,
    this is a plain function, not a method). Returns a list of URL strings,
    each without the single quote that terminates it in the markup.
    """
    # [^'\n]* stops at the first closing quote, so two links on the same line
    # are returned separately instead of being merged into one string.
    # The capture group leaves the terminating quote out of the result, and
    # the dots of the host name are escaped so they only match literal dots.
    pattern = r"(http://tbe\.taleo\.net/NA4/ats/careers/requisition[^'\n]*)'"
    return findall(pattern, self)
def isAble(cleanSourceCode):
    """Return True unless the posting mentions an out-of-bounds requirement.

    HTML tags are stripped from ``cleanSourceCode`` and the remaining text is
    searched, ignoring case, for "m.d.", "nurse" or "phd". Matching is by
    substring, so a keyword embedded in a longer word also counts.
    """
    disqualifiers = ("m.d.", "nurse", "phd")
    # Lower-case the text once rather than once per keyword.
    text = removeTags(cleanSourceCode).lower()
    # any() stops at the first keyword that is found.
    return not any(word in text for word in disqualifiers)
def removeTags(feed):
    """Return ``feed`` with every ``<...>`` tag deleted.

    A tag is the shortest run from ``<`` to the next ``>`` on the same line;
    the text between tags is left untouched.
    """
    tag_pattern = r'<.*?>'
    return sub(tag_pattern, '', feed)
def getTitle(feed):
    """Return the job title found in the page source ``feed``.

    The title is the text of the first ``<td colspan=2><b>...</b>`` cell with
    its markup removed. An empty string is returned when the page has no
    such cell.
    """
    # search() gives the first match as a string. Converting the findall()
    # list with str() would leave list syntax such as "['...']" in the
    # returned title.
    # The non-greedy .*? stops at the first closing </b> instead of the last
    # one on the line.
    match = search(r'<td colspan=2><b>.*?</b>', feed)
    if match is None:
        return ''
    return removeTags(match.group(0))
def main():
    """Print the title of every Moffitt job posting that passes isAble().

    The search-results page is downloaded first; each job URL found in it is
    then downloaded in turn and checked.
    """
    mUrl = 'http://tbe.taleo.net/NA4/ats/careers/searchResults.jsp?org=MOFFITT&cws=1'
    for link in getMoffittURLs(getHTML(mUrl)):
        job = getHTML(link)
        if not isAble(job):
            continue
        # A parenthesised single-argument print produces the same output as
        # the print statement on Python 2 and is the print function on
        # Python 3.
        print(getTitle(job) + " is open")
Thanks

New Topic/Question
Reply




MultiQuote





|