import string, urlparse, re, sys
class place:
    """A single page (URL) discovered during the crawl."""

    def create(self, i_url, i_root, i_re_neg="", i_title=""):
        """Resolve i_url against i_root and store the result on self.

        i_re_neg is a regular expression that must NOT match the
        resolved URL for the call to succeed.  Returns 1 on success,
        0 when the URL lives on a different server than i_root or
        matches i_re_neg.
        """
        self.relative_url = i_url
        self.title = i_title
        self.full_url = self.merge_url(i_url, i_root)
        if self.full_url is None:
            # the link points to a different server
            return 0
        if len(i_re_neg) > 0:
            # apply the negative filter expression
            if re.match(i_re_neg, self.full_url) is not None:
                return 0
        self.netloc = urlparse.urlparse(self.full_url).netloc
        return 1

    def merge_url(self, i_url, i_root):
        """Combine a (possibly relative) URL with its root URL.

        Handles local files and named-hash navigation.  Returns the
        absolute URL, or None when i_url points to a different server
        than i_root.
        """
        if len(i_root) == 0:
            return i_url
        _root_parts = urlparse.urlparse(i_root)
        _root_netloc = _root_parts.netloc
        _root_scheme = _root_parts.scheme
        if len(i_url) == 0:
            return i_root
        _url_parts = urlparse.urlparse(i_url)
        _url_netloc = _url_parts.netloc
        _url_scheme = _url_parts.scheme
        # both parts defined
        if _url_netloc != _root_netloc and len(_url_netloc) > 0:
            # link to a different server
            return None
        if len(_url_netloc) == 0:
            # relative path: inherit server and scheme from the root
            _url_netloc = _root_netloc
            _url_scheme = _root_scheme
        # standardise: no trailing slash on the root, no leading slash on the url
        if i_root[-1] == "/":
            i_root = i_root[:-1]
        if i_url[0] == "/":
            i_url = i_url[1:]
        # the scheme/netloc may already be part of i_url
        if i_url.startswith(_url_scheme + "://" + _url_netloc) or i_url.startswith(_url_netloc):
            return i_url
        return _url_scheme + "://" + _url_netloc + "/" + i_url
class link:
    """A directed hyperlink between two places."""

    def __init__(self):
        "Initialise an empty link: caption text plus the two endpoints."
        self.caption = ""
        self.place_from = place()
        self.place_to = place()
import xml.sax.saxutils
class network:
    """Collection of crawled places and the links between them."""

    def __init__(self):
        "Initialise an empty network."
        self.places = []
        self.links = []

    def add_place(self, i_place):
        """Register i_place; return the already-stored duplicate when one exists."""
        ret_place = self.find_place(i_place)
        if ret_place is None:
            self.places.append(i_place)
            return i_place
        return ret_place

    def find_place(self, i_place):
        """Return the stored place with the same full_url, or None."""
        for i in self.places:
            if i.full_url == i_place.full_url:
                return i
        return None

    def add_link(self, i_place_from, i_place_to, i_caption, i_root_url=""):
        """Store a link between two places on the same server.

        Returns the new link object, or None when the link is remote,
        a self link, or a back link to the root page.
        """
        if i_place_from.netloc != i_place_to.netloc:
            # ignore remote links to different servers
            return None
        if i_place_from.full_url == i_place_to.full_url:
            # ignore self link
            return None
        if i_place_to.full_url == i_root_url:
            # ignore back link to root
            return None
        if i_caption == "":
            i_caption = self.escape_html(i_place_to.full_url)
        # from and to places are on the same server
        lnk = link()
        lnk.place_from = self.add_place(i_place_from)
        lnk.place_to = self.add_place(i_place_to)
        lnk.caption = i_caption
        self.links.append(lnk)
        return lnk

    def has_page_from_role(self, i_page):
        "If full url is found in links as a from_place, return 1, else 0."
        for lnk in self.links:
            if lnk.place_from.full_url == i_page.full_url:
                return 1
        return 0

    def escape_html(self, i_link):
        """Escape &, < and > so the text is safe inside XML."""
        return xml.sax.saxutils.escape(i_link)
import sgmllib, urllib, sys
class site_parser(sgmllib.SGMLParser):
    "Parse a web site recursively, collecting A HREF links and page titles."

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.network = network()      # collected places and links
        self.page_counter = 0         # scan position inside network.places
        self.root_netloc = ""         # server of the first processed page
        self.exp_branches = []        # places already exported as branches
        self.str_indent = ""          # current indentation used by printx
        self.tag_value = []           # text fragments inside the current tag
        self.curr_ahref = ""          # href of the A tag being parsed
        self.curr_title = ""          # title of the current page
        self.curr_ahref_title = ""    # title attribute of the current A tag
        self.interesting_tag = 0      # 1 while inside a tag whose text we keep
        self.loop_counter = 0         # number of pages processed so far
        self.negative_regexp = ""     # URLs matching this are skipped
        self.root_url = ""            # URL of the first processed page

    def process_site(self, i_url, i_counter=100, i_re_neg=""):
        """Download i_url, parse it, then recurse into unvisited links.

        i_counter caps the number of pages fetched; i_re_neg is a
        filter condition (regexp) to avoid unwanted sites.
        """
        # tolerate a string limit (e.g. straight from sys.argv)
        i_counter = int(i_counter)
        self.loop_counter = self.loop_counter + 1
        if self.root_netloc == "":
            # first call: remember the root server and URL
            p = urlparse.urlparse(i_url)
            self.root_netloc = p.netloc
            self.root_url = i_url
        self.url = i_url
        self.negative_regexp = i_re_neg
        try:
            # opening can fail e.g. for named-hash contexts
            f = urllib.urlopen(i_url)
            try:
                s = f.read()
            finally:
                f.close()
            self.parse(s)
        except IOError:
            # best effort: unreachable pages are simply skipped
            pass
        # for each place seen as a target but never as a source, recurse
        next_url = self.get_next_url()
        if (next_url is None) or (self.loop_counter > i_counter):
            return
        self.process_site(next_url, i_counter, i_re_neg)

    def parse(self, s):
        "Parse the given html as string."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process an A tag; only tags carrying an HREF are interesting."
        self.curr_ahref = ""
        self.curr_ahref_title = ""
        self.tag_value = []
        self.interesting_tag = 0
        for name, value in attributes:
            if name == "href":
                self.curr_ahref = value
                # ignore empty hrefs and named-hash self links;
                # startswith also avoids an IndexError on value == ""
                if value == "" or value.startswith("#"):
                    self.interesting_tag = 0
                else:
                    self.interesting_tag = 1
            if name == "title":
                # A tags wrapping IMG keep their text in the title attribute
                self.curr_ahref_title = value

    def end_a(self):
        "Close an A tag and record the collected link."
        if self.interesting_tag == 0:
            # some A tags carry only a NAME attribute
            return
        if len(self.curr_ahref) > 0:
            # standardise named hash locations against the current page
            if self.curr_ahref[0] == "#":
                self.curr_ahref = self.url + self.curr_ahref
        caption = "".join(self.tag_value).strip()
        if caption == "":
            caption = self.curr_ahref_title
        # create the endpoint objects
        p_from = place()
        res_from = p_from.create("", self.url, self.negative_regexp, self.curr_title)
        p_to = place()
        res_to = p_to.create(self.curr_ahref, self.url, self.negative_regexp)
        self.interesting_tag = 0
        if res_from == 1 and res_to == 1:
            # store the link only when both endpoints are acceptable
            self.network.add_link(p_from, p_to, caption, self.root_url)

    def start_title(self, attributes):
        "Start collecting the page title text (data arrives via handle_data)."
        self.interesting_tag = 1
        self.tag_value = []

    def end_title(self):
        "Store the collected page title."
        self.interesting_tag = 0
        self.curr_title = "".join(self.tag_value).strip()

    def handle_data(self, data):
        "Collect text appearing between an interesting start and end tag."
        if self.interesting_tag == 1:
            self.tag_value.append(data)

    def print_hyperlinks(self):
        "Print one line per collected hyperlink."
        for lnk in self.network.links:
            #print lnk.place_from.full_url, "->", lnk.place_to.full_url, " aka ", lnk.caption
            print("")

    def export_item(self, i_place, i_caption, i_indent, i_parent_url="", i_root_url=""):
        """Export one place (and, for a branch, its children) recursively.

        i_indent is the depth, starting at 1 for the root item.
        Already-exported branches are emitted as plain references.
        """
        if (i_place.full_url == i_parent_url) or (i_place.full_url == i_root_url):
            # ignore links back to the parent and to the root page
            return
        if i_root_url == "":
            # first call: the root itself must be exported
            i_root_url = self.root_url
        is_branch = self.network.has_page_from_role(i_place)
        was_exported = 0
        self.str_indent = " " * i_indent
        for exp in self.exp_branches:
            # has this url been exported already?
            if exp.full_url == i_place.full_url:
                was_exported = 1
                break
        if was_exported == 1:
            self.printx("")
            self.printx(" " + i_caption + "")
            self.printx("")
            return
        # a new item
        if is_branch == 1:
            if i_indent == 1:
                # the first branch takes its title from the page, not the link
                i_caption = i_place.title
            self.exp_branches.append(i_place)
            self.printx("")
            self.printx(" " + i_caption + "")
            freeze_indent = self.str_indent
            for lnk in self.network.links:
                if lnk.place_from.full_url == i_place.full_url:
                    self.export_item(lnk.place_to, lnk.caption, i_indent + 1, i_place.full_url, i_root_url)
            self.str_indent = freeze_indent
            self.printx("")
        else:
            self.printx("")
            self.printx(" " + i_caption + "")
            self.printx("")

    def export_hyperlinks(self):
        "Create the xml export on standard output."
        self.exp_branches = []
        self.printx("""
""")
        self.export_item(self.network.links[0].place_from, self.network.links[0].caption, 1)
        self.str_indent = ''
        self.printx("")

    def get_next_url(self):
        "Find the next url seen as a link target but never as a source."
        start_cnt = self.page_counter
        for pg in self.network.places[start_cnt:]:
            self.page_counter = self.page_counter + 1
            # the rules for adding places live in the add_link method
            if not self.network.has_page_from_role(pg):
                return pg.full_url
        return None

    def printx(self, i_str):
        "Print i_str prefixed with the current indentation."
        print(self.str_indent + i_str)
# Entry point: crawl the page given on the command line and export
# the discovered link structure as XML.
# argv[1] = start URL, argv[2] = page limit, argv[3] = negative regexp.
# Example: script.py 'http://sql2008rs:81/dokuwiki/doku.php' 100 '.*(backlink|feed.php)'
myparser = site_parser()
# argv values are strings; the page limit must be an int, otherwise the
# loop_counter > i_counter comparison can never stop the crawl.
page_limit = int(sys.argv[2]) if len(sys.argv) > 2 else 100
negative_re = sys.argv[3] if len(sys.argv) > 3 else ""
myparser.process_site(sys.argv[1], page_limit, negative_re)
# Write out the collected hyperlinks.
myparser.export_hyperlinks()