import cgitb
cgitb.enable()
import cgi
import urllib
import sys
from lxml import etree
links = []
swf_links = []
temp_links = []
## Recursion for searching links:
def find_webpage(links, maxDepth, depth):
global swf_links
global temp_links
for element1 in links:
try:
webPage = urllib.urlopen(element1)
html = webPage.read()
dom = etree.HTML(html)
for element in dom.iter("a"):
link = element.get("href")
if link != None and not link.startswith("#"):
if not link.startswith("http://") and not link.startswith("https://"):
link=str(element1) + '/' + str(link)
if not link.endswith(".swf"):
temp_links.append(link)
else:
swf_links.append(link)
#- Search for the src attribute
for element in dom.iter():
src = element.get("src")
if src != None and src.startswith("http://") and src.endswith(".swf"):
swf_links.append(src)
# Close the connection
webPage.close()
except UnicodeEncodeError:
continue
if depth <= maxDepth:
for element in temp_links:
for element2 in links:
if not (element==element2):
links.append(element)
break
temp_links=[]
find_webpage(links, maxDepth, depth+1)
links = A list of links that want to search by users, maxDepth = Input by users of how many depths of webpages they want to search
depth = 1
But I find that there are no problems for maxDepth=0 / 1, but after 1, it starts to be cannot processed the output. Why? Can you help me to solve it to search more quickly?
Mod edit - Fixed code tags

New Topic/Question
Reply




MultiQuote


|