import sgmllib
class MyParser(sgmllib.SGMLParser):
"A simple parser class."
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
"Initialise an object, passing 'verbose' to the superclass."
sgmllib.SGMLParser.__init__(self, verbose)
self.hyperlinks = []
def start_a(self, attributes):
"Process a hyperlink and its 'attributes'."
for name, value in attributes:
if name == "href":
self.hyperlinks.append(value)
def get_hyperlinks(self):
"Return the list of hyperlinks."
return self.hyperlinks
import urllib, sgmllib
# Get something to work with.
# (The original line was missing the closing quote on the URL.)
f = urllib.urlopen("http://www.waylink-english.co.uk/?page=11620&pw=1")
try:
    s = f.read()
finally:
    # Release the connection even if the read fails.
    f.close()
# Try and process the page.
# The class should have been defined first, remember.
myparser = MyParser()
myparser.parse(s)
# Get the hyperlinks.
# With a single argument, print(...) behaves exactly like the Python 2
# print statement.
print(myparser.get_hyperlinks())
The code looks for href as an attribute of the <a> tag, but I don't see where it specifies <a> as the tag to use. The tag I'm looking for (<f6>) has no attributes, so I just want to get the contents of all <f6> tags. Any help would be greatly appreciated!

New Topic/Question
Reply



MultiQuote






|