Here is the Python code:
import htmllib
import formatter
import pprint
class TableParser(htmllib.HTMLParser):
def __init__(self):
self.active=0
self.finished=0
self.skipping=0
self.result=[]
self.current_row=[]
self.current_data=[]
htmllib.HTMLParser.__init__(
self, formatter.NullFormatter())
def start_table(self,attributes):
if not self.finished:
self.active=1
def end_table(self):
self.active=1
self.finished=1
def start_tbody(self,attributes):
self.skipping=0
def end_tbody(self):
self.skipping=1
def start_tr(self,attributes):
if self.active and not self.skipping:
self.current_row = []
def end_tr(self):
if self.active and not self.skipping:
self.result.append(self.current_row)
def start_td(self,attributes):
if self.active and not self.skipping:
self.current_data = []
def end_td(self):
if self.active and not self.skipping:
self.current_row.append(
' '.join(self.current_data))
def handle_data(self, data):
if self.active and not self.skipping:
self.current_data.append(data)
def process(filename):
    """Parse the HTML file *filename* and return the collected table rows.

    The file is read inside a ``with`` block so the handle is closed even
    if reading or parsing raises. Returns ``parser.result``: a list of
    rows, each a list of cell strings.
    """
    parser = TableParser()
    with open(filename) as source:
        parser.feed(source.read())
    parser.close()
    return parser.result
def showparse(filename):
    """Pretty-print the rows that ``process`` extracts from *filename*."""
    rows = process(filename)
    pprint.pprint(rows)
def _test():
    """Run ``showparse`` on the sample page and return its result."""
    sample_page = '490.html'
    return showparse(sample_page)
# Run the self-test only when executed as a script, not when imported.
if __name__ == '__main__':
    _test()
Here is the HTML source:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<!-- saved from url=(0095)http://banner7bl04.uncfsu.edu:9000/PROD/bwckschd.p_disp_detail_sched?term_in=201020&crn_in=3221 -->
<HTML lang=en><HEAD><TITLE>Detailed Class Information</TITLE>
<META http-equiv=Content-Type content="text/html; charset=WINDOWS-1252">
<META http-equiv=Pragma content=no-cache name=Cache-Control>
<META http-equiv=Cache-Control content=no-cache name=Cache-Control><LINK
href="490_files/web_defaultapp.css" type=text/css rel=stylesheet><LINK
media=print href="490_files/web_defaultprint.css" type=text/css rel=stylesheet>
<META http-equiv=Content-Script-Type content=text/javascript
name=Default_Script_Language>
<script language=Javascript type=text/javascript>
<!-- Hide Javascript from older browsers
var submitcount=0;
function checkSubmit() {
if (submitcount == 0)
{
submitcount++;
return true;
}
else
{
alert("Your changes have already been submitted.");
return false;
}
}
// End script hiding -->
</SCRIPT>
<script language=Javascript type=text/javascript>
<!-- Hide Javascript from older browsers
// Function to open a window
function windowOpen(window_url) {
helpWin = window.open(window_url,'','toolbar=yes,status=no,scrollbars=yes,menubar=yes,resizable=yes,directories=no,location=no,width=350,height=400');
if (document.images) {
if (helpWin) helpWin.focus()
}
}
// End script hiding -->
</SCRIPT>
<META content="MSHTML 6.00.6000.16981" name=GENERATOR></HEAD>
<BODY>
<DIV class=headerwrapperdiv>
<DIV class=pageheaderdiv1><A class=skiplinks
onblur="window.status=''; return true"
onmouseover="window.status='Go to Main Content'; return true"
onfocus="window.status='Go to Main Content'; return true"
onmouseout="window.status=''; return true"
href="http://banner7bl04.uncfsu.edu:9000/PROD/bwckschd.p_disp_detail_sched?term_in=201020&crn_in=3221#main_content">Go
to Main Content</A>
<H1>FAYETTEVILLE STATE UNIVERSITY</H1></DIV>
<DIV class=headerlinksdiv></DIV>
<TABLE class=plaintable width="100%"
summary="This table displays Menu Items and Banner Search textbox.">
<TBODY>
<TR>
<TD class=pldefault>
<DIV class=headerlinksdiv2>
<FORM action=/PROD/twbksrch.P_ShowResults method=post>Search <SPAN
class=fieldlabeltextinvisible><LABEL for=keyword_in_id><SPAN
class=fieldlabeltext>Search</SPAN></LABEL></SPAN> <INPUT id=keyword_in_id
maxLength=65 name=KEYWRD_IN> <INPUT type=submit value=Go> </FORM></DIV></TD>
<TD class=pldefault>
<P class=rightaligntext><SPAN class=pageheaderlinks><A
class=submenulinktext2 accessKey=2
href="http://banner7bl04.uncfsu.edu:9000/PROD/twbksite.P_DispSiteMap?menu_name_in=bmenu.P_MainMnu&depth_in=2&columns_in=3">SITE
MAP</A> | <A class=submenulinktext2 onblur="window.status=''; return true"
onmouseover="window.status=''; return true"
onfocus="window.status=''; return true" accessKey=H
onclick="popup = window.open('/wtlhelp/twbhhelp.htm', 'PopupPage','height=450,width=500,scrollbars=yes,resizable=yes'); return false"
onmouseout="window.status=''; return true"
href="http://banner7bl04.uncfsu.edu:9000/wtlhelp/twbhhelp.htm"
target=_blank>HELP</A> | <A class=submenulinktext2 accessKey=3
href="http://banner7bl04.uncfsu.edu:9000/PROD/twbkwbis.P_Logout">EXIT</A>
</SPAN></P></TD></TR></TBODY></TABLE></DIV>
<DIV class=pagetitlediv>
<TABLE class=plaintable width="100%"
summary="This table displays title and static header displays.">
<TBODY>
<TR>
<TD class=pldefault>
<H2>Detailed Class Information</H2></TD>
<TD class=pldefault> </TD>
<TD class=pldefault>
<P class=rightaligntext>
<DIV class=staticheaders>Spring Semester 2010<BR>Mar 25,
2010<BR></DIV></TD></TR>
<TR>
<TD class=bg3 width="100%" colSpan=3><IMG title="Transparent Image"
height=3 alt="Transparent Image" hspace=0
src="490_files/web_transparent.gif" width=10 border=0
name=web_transparent></TD></TR></TBODY></TABLE><A name=main_content></A></DIV>
<DIV class=pagebodydiv><!-- ** END OF twbkwbis.P_OpenDoc ** -->
<TABLE class=datadisplaytable width="100%"
summary="This table is used to present the detailed class information.">
<CAPTION class=captiontext>Detailed Class Information</CAPTION>
<TBODY>
<TR>
<TH class=ddlabel scope=row>Senior Project - 3221 - CSC 490 -
01<BR><BR></TH></TR>
<TR>
<TD class=dddefault><SPAN class=fieldlabeltext>Associated Term:
</SPAN>Spring Semester 2010 <BR><SPAN class=fieldlabeltext>Levels:
</SPAN>Undergraduate <BR><BR>Main Campus <BR>Lecture Schedule Type
<BR>Classroom Instructional Method <BR>3.000 Credits <BR><A
href="http://banner7bl04.uncfsu.edu:9000/PROD/bwckctlg.p_display_courses?term_in=201020&one_subj=CSC&sel_crse_strt=490&sel_crse_end=490&sel_subj=&sel_levl=&sel_schd=&sel_coll=&sel_divs=&sel_dept=&sel_attr=">View
Catalog Entry</A> <BR><BR><BR>
<TABLE class=datadisplaytable width="100%"
summary="This layout table is used to present the seating numbers.">
<CAPTION class=captiontext>Registration Availability</CAPTION>
<TBODY>
<TR>
<TD class=dddead> </TD>
<TH class=ddheader scope=col><SPAN
class=fieldlabeltext>Capacity</SPAN></TH>
<TH class=ddheader scope=col><SPAN
class=fieldlabeltext>Actual</SPAN></TH>
<TH class=ddheader scope=col><SPAN
class=fieldlabeltext>Remaining</SPAN></TH>
<TR>
<TH class=ddlabel scope=row><SPAN
class=fieldlabeltext>Seats</SPAN></TH>
<TD class=dddefault>15</TD>
<TD class=dddefault>16</TD>
<TD class=dddefault>-1</TD></TR>
<TR>
<TH class=ddlabel scope=row><SPAN class=fieldlabeltext>Waitlist
Seats</SPAN></TH>
<TD class=dddefault>0</TD>
<TD class=dddefault>0</TD>
<TD
class=dddefault>0</TD></TR></TBODY></TABLE><BR><BR></TD></TR></TBODY></TABLE><BR>
<TABLE class=datadisplaytable width="50%"
summary="This is for formatting of the bottom links.">
<TBODY>
<TR>
<TD class=ntdefault><A onblur="window.status=''; return true"
onmouseover="window.status='Return to Previous'; return true"
onfocus="window.status='Return to Previous'; return true"
onmouseout="window.status=''; return true"
href="javascript:history.go(-1)">Return to Previous</A> </TD>
<TD class=ntdefault><A
href="http://banner7bl04.uncfsu.edu:9000/PROD/bwckschd.p_disp_dyn_sched">New
Search</A></TD></TR></TBODY></TABLE><!-- ** START OF twbkwbis.P_CloseDoc ** -->
<TABLE class=plaintable cellSpacing=0 cellPadding=0 width="100%"
summary="This is table displays line separator at end of the page." border=0>
<TBODY>
<TR>
<TD class=bgtabon width="100%" colSpan=2><IMG title="Transparent Image"
height=3 alt="Transparent Image" hspace=0
src="490_files/web_transparent.gif" width=10 border=0
name=web_transparent></TD></TR></TBODY></TABLE><A class=skiplinks
onblur="window.status=''; return true"
onmouseover="window.status='Skip to top of page'; return true"
onfocus="window.status='Skip to top of page'; return true"
onmouseout="window.status=''; return true"
href="http://banner7bl04.uncfsu.edu:9000/PROD/bwckschd.p_disp_detail_sched?term_in=201020&crn_in=3221#top">Skip
to top of page</A> </DIV>
<DIV class=pagefooterdiv><SPAN class=releasetext>Release: 8.0</SPAN> </DIV>
<DIV class=poweredbydiv><A href="http://www.sct.com/"><IMG
title="Powered by Sungard" height=24 alt="Powered by Sungard" hspace=0
src="490_files/web_powered_by.gif" width=187 border=0 name=web_powered_by></A>
</DIV>
<DIV class=div1></DIV>
<DIV class=div2></DIV>
<DIV class=div3></DIV>
<DIV class=div4></DIV>
<DIV class=div5></DIV>
<DIV class=div6></DIV></BODY></HTML>
The output I receive is:
[['\n \n Search Search ', '\n SITE \n MAP [2] | HELP [3] | EXIT [4] \n '], ['\n Detailed Class Information', '\xa0 ', '\n \n Spring Semester 2010 Mar 25, \n 2010'], ['Transparent Image'], [], ['15', '16', '-1'], ['0', '0', '0'], ['0', '0', '0'], ['Return to Previous [6] ', 'New \n Search [7]'], ['Transparent Image']]
The output I am looking to receive is:
['15', '16', '-1'], ['0', '0', '0'],
I would appreciate help, thanks!

New Topic/Question
Reply




MultiQuote


|