import re
import sys
import urllib
import urlparse
import sgmllib
import xml.sax.saxutils


class place:
    def create(self, i_url, i_root, i_re_neg="", i_title=""):
        #i_re_neg = regular expression that must NOT match for success (return 1)
        #check the regexp condition and that i_url lives on the same server as i_root
        #in the negative case it returns 0
        self.relative_url = i_url
        self.title = i_title
        self.full_url = self.merge_url(i_url, i_root)
        if self.full_url is None:
            return 0
        if len(i_re_neg) > 0:
            #apply the regexp condition
            r = re.compile(i_re_neg)
            if r.match(self.full_url) is not None:
                #the i_re_neg pattern matched
                #print "found", i_re_neg, "in", self.full_url
                return 0
        #print "full_url:", self.full_url
        self.netloc = urlparse.urlparse(self.full_url).netloc
        return 1

    def merge_url(self, i_url, i_root):
        #handles local files and named-anchor (#...) navigation differently
        _url_netloc = ""
        _root_netloc = ""
        _url_scheme = ""
        _root_scheme = ""
        if len(i_root) == 0:
            return i_url
        else:
            _root_netloc = urlparse.urlparse(i_root).netloc
            _root_scheme = urlparse.urlparse(i_root).scheme
        if len(i_url) == 0:
            return i_root
        else:
            _url_netloc = urlparse.urlparse(i_url).netloc
            _url_scheme = urlparse.urlparse(i_url).scheme
        #both parts defined
        if _url_netloc != _root_netloc and len(_url_netloc) > 0:
            #link to a different server
            return None
        if len(_url_netloc) == 0:
            #relative path
            _url_netloc = _root_netloc
            _url_scheme = _root_scheme
        #standardise
        if i_root[-1] == "/":
            i_root = i_root[:-1]
        if i_url[0] == "/":
            i_url = i_url[1:]
        #the _url_scheme may already be part of i_url
        if i_url.startswith(_url_scheme + "://" + _url_netloc) or i_url.startswith(_url_netloc):
            return i_url
        else:
            return _url_scheme + "://" + _url_netloc + "/" + i_url


class link:
    def __init__(self):
        "Initialise an object"
        self.place_from = place()
        self.place_to = place()
        self.caption = ""


class network:
    def __init__(self):
        "Initialise an object"
        self.places = []
        self.links = []

    def add_place(self, i_place):
        ret_place = self.find_place(i_place)
        if ret_place is None:
            self.places.append(i_place)
            return i_place
        else:
            return ret_place

    def find_place(self, i_place):
        for i in self.places:
            if i.full_url == i_place.full_url:
                return i
        return None

    def add_link(self, i_place_from, i_place_to, i_caption, i_root_url=""):
        lnk = link()
        if i_place_from.netloc != i_place_to.netloc:
            #ignore remote links to different servers
            return None
        if i_place_from.full_url == i_place_to.full_url:
            #ignore self links
            return None
        if i_place_to.full_url == i_root_url:
            #ignore back links to the root
            return None
        if i_caption == "":
            i_caption = self.escape_html(i_place_to.full_url)
        #the from and to places are on the same server
        lnk.place_from = self.add_place(i_place_from)
        lnk.place_to = self.add_place(i_place_to)
        lnk.caption = i_caption
        self.links.append(lnk)
        return lnk

    def has_page_from_role(self, i_page):
        "Return 1 if the full url appears as a place_from in any link"
        for lnk in self.links:
            if lnk.place_from.full_url == i_page.full_url:
                return 1
        return 0

    def escape_html(self, i_link):
        return xml.sax.saxutils.escape(i_link)
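#A few illustrative merge_url results, traced from the code above
#(example.com and other.org are made-up hosts):
#
#   place().merge_url("about.html", "http://example.com/wiki/")
#       -> "http://example.com/about.html"
#   place().merge_url("/doku.php?id=start", "http://example.com")
#       -> "http://example.com/doku.php?id=start"
#   place().merge_url("http://other.org/x", "http://example.com")
#       -> None (link to a different server)
#
#Note that relative links are resolved against the server root, not against
#the directory of i_root; urlparse.urljoin would be the stricter alternative.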
class site_parser(sgmllib.SGMLParser):
    "Parse the site for A HREF links and titles"

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.network = network()
        self.page_counter = 0
        self.root_netloc = ""
        self.exp_branches = []
        self.str_indent = ""
        self.tag_value = []
        self.curr_ahref = ""
        self.curr_title = ""
        self.curr_ahref_title = ""
        self.interesting_tag = 0
        self.loop_counter = 0
        self.negative_regexp = ""
        self.root_url = ""

    def process_site(self, i_url, i_counter=100, i_re_neg=""):
        "Get the web content; i_re_neg is a filter condition for URLs to skip"
        self.loop_counter = self.loop_counter + 1
        if self.root_netloc == "":
            p = urlparse.urlparse(i_url)
            self.root_netloc = p.netloc
            self.root_url = i_url
        self.url = i_url
        self.negative_regexp = i_re_neg
        try:
            #opening may fail, e.g. for URLs with a named-anchor context
            f = urllib.urlopen(i_url)
            s = f.read()
            self.parse(s)
            #print "succ:", self.loop_counter, i_url
        except IOError:
            #print 'err', i_url, sys.exc_info()[1]
            pass  #do nothing: skip pages that cannot be opened
        #for each page_to not yet seen as a page_from, call process_site again
        next_url = self.get_next_url()
        if (next_url is None) or (self.loop_counter > i_counter):
            return
        else:
            self.process_site(next_url, i_counter, i_re_neg)

    def parse(self, s):
        "Parse the given html string."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'. Only A tags with HREF matter."
        self.curr_ahref = ""
        self.curr_ahref_title = ""
        self.tag_value = []
        self.interesting_tag = 0
        for name, value in attributes:
            if name == "href":
                self.curr_ahref = value
                if len(value) == 0 or value[0] == "#":
                    #ignore empty links and self (named-anchor) links
                    self.interesting_tag = 0
                else:
                    self.interesting_tag = 1
            if name == "title":
                #A tags wrapping an IMG carry their text in the title attribute
                self.curr_ahref_title = value

    def end_a(self):
        if self.interesting_tag == 0:
            #some A tags carry only a NAME attribute
            return
        if len(self.curr_ahref) > 0:
            #standardise named-anchor locations
            if self.curr_ahref[0] == "#":
                self.curr_ahref = self.url + self.curr_ahref
        title = "".join(self.tag_value).strip()
        if title == "":
            title = self.curr_ahref_title
        #create the objects
        p_from = place()
        res_from = p_from.create("", self.url, self.negative_regexp, self.curr_title)
        p_to = place()
        res_to = p_to.create(self.curr_ahref, self.url, self.negative_regexp)
        self.interesting_tag = 0
        if res_from == 1 and res_to == 1:
            #record the link only if both ends are valid
            self.network.add_link(p_from, p_to, title, self.root_url)

    def start_title(self, attributes):
        #the title data are not available yet
        self.interesting_tag = 1
        self.tag_value = []

    def end_title(self):
        self.interesting_tag = 0
        self.curr_title = "".join(self.tag_value).strip()

    def handle_data(self, data):
        #collect the text between a start tag and its end tag
        if self.interesting_tag == 1:
            self.tag_value.append(data)

    def print_hyperlinks(self):
        "Print the list of hyperlinks."
        for lnk in self.network.links:
            print lnk.place_from.full_url, "->", lnk.place_to.full_url, " aka ", lnk.caption

    def export_item(self, i_place, i_caption, i_indent, i_parent_url="", i_root_url=""):
        #is it a branch? was it already exported?
        #i_indent - the depth count, starting at 1
        #the element names map/branch/leaf/title are placeholders; rename them
        #to match the XML schema your map viewer expects
        if (i_place.full_url == i_parent_url) or (i_place.full_url == i_root_url):
            #ignore links back to the parent and to the root page
            return
        if i_root_url == "":
            #the root is being exported for the first time
            i_root_url = self.root_url
        is_branch = self.network.has_page_from_role(i_place)
        was_exported = 0
        self.str_indent = " " * i_indent
        for exp in self.exp_branches:
            #find the url
            if exp.full_url == i_place.full_url:
                was_exported = 1
                break
        if was_exported == 1:
            #repeat an already exported branch as a plain leaf
            self.printx("<leaf>")
            self.printx(" <title>" + i_caption + "</title>")
            self.printx("</leaf>")
            return
        #a new item
        if is_branch == 1:
            if i_indent == 1:
                #the first branch takes its title from the page, not from a link
                i_caption = i_place.title
            self.exp_branches.append(i_place)
            self.printx("<branch>")
            self.printx(" <title>" + i_caption + "</title>")
            freeze_indent = self.str_indent
            for lnk in self.network.links:
                if lnk.place_from.full_url == i_place.full_url:
                    self.export_item(lnk.place_to, lnk.caption, i_indent + 1, i_place.full_url, i_root_url)
            self.str_indent = freeze_indent
            self.printx("</branch>")
        else:
            self.printx("<leaf>")
            self.printx(" <title>" + i_caption + "</title>")
            self.printx("</leaf>")

    def export_hyperlinks(self):
        "Create the xml export."
        self.exp_branches = []
        self.printx("""<?xml version="1.0" encoding="UTF-8"?>
<map>""")
        self.export_item(self.network.links[0].place_from, self.network.links[0].caption, 1)
        self.str_indent = ''
        self.printx("</map>")

    def get_next_url(self):
        "Find the next url that appears as a place_to but not yet as a place_from"
        start_cnt = self.page_counter
        for pg in self.network.places[start_cnt:]:
            self.page_counter = self.page_counter + 1
            #the rules for adding places live in the add_link method
            if not self.network.has_page_from_role(pg):
                return pg.full_url
        return None

    def printx(self, i_str):
        #print the string with the current indentation
        print self.str_indent + i_str
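#A minimal sketch of driving the parser from code rather than from the command
#line; the wiki URL and the filter pattern are assumptions mirroring the
#commented example in the driver below, which reads the same three values
#from sys.argv:
#
#   parser = site_parser()
#   parser.process_site("http://wiki.example.com/doku.php", 100, ".*(backlink|feed.php)")
#   parser.export_hyperlinks()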
# Try to process the page.
myparser = site_parser()
#myparser.process_site('http://sql2008rs:81/dokuwiki/doku.php', 100, '.*(backlink|feed.php)')
myparser.process_site(sys.argv[1], int(sys.argv[2]), sys.argv[3])
# Export the hyperlinks.
myparser.export_hyperlinks()
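#Expected invocation, assuming the script is saved as site_map.py (the file
#name and URL are examples): the second argument is the page limit, the third
#a regular expression of URLs to skip:
#
#   python site_map.py http://wiki.example.com/doku.php 100 ".*(backlink|feed.php)"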