import re
import sys
import urllib
import urlparse
import sgmllib
import xml.sax.saxutils


class place:
    def create(self, i_url, i_root, i_re_neg="", i_title=""):
        #i_re_neg = regular expression that must NOT match for success (return 1)
        #check the regexp condition and that i_url lives on the same server as i_root
        #in the negative case it returns 0
        self.relative_url = i_url
        self.title = i_title
        self.full_url = self.merge_url(i_url, i_root)
        if self.full_url is None:
            return 0
        if len(i_re_neg) > 0:
            #apply the regexp condition
            r = re.compile(i_re_neg)
            if r.match(self.full_url) is not None:
                #the i_re_neg pattern matched
                #print "found", i_re_neg, "in", self.full_url
                return 0
        #print "full_url:", self.full_url
        self.netloc = urlparse.urlparse(self.full_url).netloc
        return 1

    def merge_url(self, i_url, i_root):
        #handles local files and named-anchor (#...) navigation differently
        _url_netloc = ""
        _root_netloc = ""
        _url_scheme = ""
        _root_scheme = ""
        if len(i_root) == 0:
            return i_url
        else:
            _root_netloc = urlparse.urlparse(i_root).netloc
            _root_scheme = urlparse.urlparse(i_root).scheme
        if len(i_url) == 0:
            return i_root
        else:
            _url_netloc = urlparse.urlparse(i_url).netloc
            _url_scheme = urlparse.urlparse(i_url).scheme
        #both parts defined
        if _url_netloc != _root_netloc and len(_url_netloc) > 0:
            #link to a different server
            return None
        if len(_url_netloc) == 0:
            #relative path
            _url_netloc = _root_netloc
            _url_scheme = _root_scheme
        #standardise
        if i_root[-1] == "/":
            i_root = i_root[:-1]
        if i_url[0] == "/":
            i_url = i_url[1:]
        #the _url_scheme may already be part of i_url
        if i_url.startswith(_url_scheme + "://" + _url_netloc) or i_url.startswith(_url_netloc):
            return i_url
        else:
            return _url_scheme + "://" + _url_netloc + "/" + i_url


class link:
    def __init__(self):
        "Initialise an object"
        self.place_from = place()
        self.place_to = place()
        self.caption = ""


class network:
    def __init__(self):
        "Initialise an object"
        self.places = []
        self.links = []

    def add_place(self, i_place):
        ret_place = self.find_place(i_place)
        if ret_place is None:
            self.places.append(i_place)
            return i_place
        else:
            return ret_place

    def find_place(self, i_place):
        for i in self.places:
            if i.full_url == i_place.full_url:
                return i
        return None

    def add_link(self, i_place_from, i_place_to, i_caption, i_root_url=""):
        lnk = link()
        if i_place_from.netloc != i_place_to.netloc:
            #ignore remote links to different servers
            return None
        if i_place_from.full_url == i_place_to.full_url:
            #ignore self links
            return None
        if i_place_to.full_url == i_root_url:
            #ignore back links to the root
            return None
        if i_caption == "":
            i_caption = self.escape_html(i_place_to.full_url)
        #the from and to places are on the same server
        lnk.place_from = self.add_place(i_place_from)
        lnk.place_to = self.add_place(i_place_to)
        lnk.caption = i_caption
        self.links.append(lnk)
        return lnk

    def has_page_from_role(self, i_page):
        "Return 1 if the full url appears as a place_from in any link"
        for lnk in self.links:
            if lnk.place_from.full_url == i_page.full_url:
                return 1
        return 0

    def escape_html(self, i_link):
        return xml.sax.saxutils.escape(i_link)
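#A few illustrative merge_url results, traced from the code above
#(example.com and other.org are made-up hosts):
#
#   place().merge_url("about.html", "http://example.com/wiki/")
#       -> "http://example.com/about.html"
#   place().merge_url("/doku.php?id=start", "http://example.com")
#       -> "http://example.com/doku.php?id=start"
#   place().merge_url("http://other.org/x", "http://example.com")
#       -> None (link to a different server)
#
#Note that relative links are resolved against the server root, not against
#the directory of i_root; urlparse.urljoin would be the stricter alternative.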
class site_parser(sgmllib.SGMLParser):
    "Parse the site for A HREF links and titles"

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        sgmllib.SGMLParser.__init__(self, verbose)
        self.network = network()
        self.page_counter = 0
        self.root_netloc = ""
        self.exp_branches = []
        self.str_indent = ""
        self.tag_value = []
        self.curr_ahref = ""
        self.curr_title = ""
        self.curr_ahref_title = ""
        self.interesting_tag = 0
        self.loop_counter = 0
        self.negative_regexp = ""
        self.root_url = ""

    def process_site(self, i_url, i_counter=100, i_re_neg=""):
        "Get the web content; i_re_neg is a filter condition for URLs to skip"
        self.loop_counter = self.loop_counter + 1
        if self.root_netloc == "":
            p = urlparse.urlparse(i_url)
            self.root_netloc = p.netloc
            self.root_url = i_url
        self.url = i_url
        self.negative_regexp = i_re_neg
        try:
            #opening may fail, e.g. for URLs with a named-anchor context
            f = urllib.urlopen(i_url)
            s = f.read()
            self.parse(s)
            #print "succ:", self.loop_counter, i_url
        except IOError:
            #print 'err', i_url, sys.exc_info()[1]
            pass  #do nothing: skip pages that cannot be opened
        #for each page_to not yet seen as a page_from, call process_site again
        next_url = self.get_next_url()
        if (next_url is None) or (self.loop_counter > i_counter):
            return
        else:
            self.process_site(next_url, i_counter, i_re_neg)

    def parse(self, s):
        "Parse the given html string."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'. Only A tags with HREF matter."
        self.curr_ahref = ""
        self.curr_ahref_title = ""
        self.tag_value = []
        self.interesting_tag = 0
        for name, value in attributes:
            if name == "href":
                self.curr_ahref = value
                if len(value) == 0 or value[0] == "#":
                    #ignore empty links and self (named-anchor) links
                    self.interesting_tag = 0
                else:
                    self.interesting_tag = 1
            if name == "title":
                #A tags wrapping an IMG carry their text in the title attribute
                self.curr_ahref_title = value

    def end_a(self):
        if self.interesting_tag == 0:
            #some A tags carry only a NAME attribute
            return
        if len(self.curr_ahref) > 0:
            #standardise named-anchor locations
            if self.curr_ahref[0] == "#":
                self.curr_ahref = self.url + self.curr_ahref
        title = "".join(self.tag_value).strip()
        if title == "":
            title = self.curr_ahref_title
        #create the objects
        p_from = place()
        res_from = p_from.create("", self.url, self.negative_regexp, self.curr_title)
        p_to = place()
        res_to = p_to.create(self.curr_ahref, self.url, self.negative_regexp)
        self.interesting_tag = 0
        if res_from == 1 and res_to == 1:
            #record the link only if both ends are valid
            self.network.add_link(p_from, p_to, title, self.root_url)

    def start_title(self, attributes):
        #the title data are not available yet
        self.interesting_tag = 1
        self.tag_value = []

    def end_title(self):
        self.interesting_tag = 0
        self.curr_title = "".join(self.tag_value).strip()

    def handle_data(self, data):
        #collect the text between a start tag and its end tag
        if self.interesting_tag == 1:
            self.tag_value.append(data)

    def print_hyperlinks(self):
        "Print the list of hyperlinks."
        for lnk in self.network.links:
            print lnk.place_from.full_url, "->", lnk.place_to.full_url, " aka ", lnk.caption

    def export_item(self, i_place, i_caption, i_indent, i_parent_url="", i_root_url=""):
        #is it a branch? was it already exported?
        #i_indent - the depth count, starting at 1
        #the element names map/branch/leaf/title are placeholders; rename them
        #to match the XML schema your map viewer expects
        if (i_place.full_url == i_parent_url) or (i_place.full_url == i_root_url):
            #ignore links back to the parent and to the root page
            return
        if i_root_url == "":
            #the root is being exported for the first time
            i_root_url = self.root_url
        is_branch = self.network.has_page_from_role(i_place)
        was_exported = 0
        self.str_indent = " " * i_indent
        for exp in self.exp_branches:
            #find the url
            if exp.full_url == i_place.full_url:
                was_exported = 1
                break
        if was_exported == 1:
            #repeat an already exported branch as a plain leaf
            self.printx("<leaf>")
            self.printx(" <title>" + i_caption + "</title>")
            self.printx("</leaf>")
            return
        #a new item
        if is_branch == 1:
            if i_indent == 1:
                #the first branch takes its title from the page, not from a link
                i_caption = i_place.title
            self.exp_branches.append(i_place)
            self.printx("<branch>")
            self.printx(" <title>" + i_caption + "</title>")
            freeze_indent = self.str_indent
            for lnk in self.network.links:
                if lnk.place_from.full_url == i_place.full_url:
                    self.export_item(lnk.place_to, lnk.caption, i_indent + 1, i_place.full_url, i_root_url)
            self.str_indent = freeze_indent
            self.printx("</branch>")
        else:
            self.printx("<leaf>")
            self.printx(" <title>" + i_caption + "</title>")
            self.printx("</leaf>")

    def export_hyperlinks(self):
        "Create the xml export."
        self.exp_branches = []
        self.printx("""<?xml version="1.0" encoding="UTF-8"?>
<map>""")
        self.export_item(self.network.links[0].place_from, self.network.links[0].caption, 1)
        self.str_indent = ''
        self.printx("</map>")

    def get_next_url(self):
        "Find the next url that appears as a place_to but not yet as a place_from"
        start_cnt = self.page_counter
        for pg in self.network.places[start_cnt:]:
            self.page_counter = self.page_counter + 1
            #the rules for adding places live in the add_link method
            if not self.network.has_page_from_role(pg):
                return pg.full_url
        return None

    def printx(self, i_str):
        #print the string with the current indentation
        print self.str_indent + i_str
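#A minimal sketch of driving the parser from code rather than from the command
#line; the wiki URL and the filter pattern are assumptions mirroring the
#commented example in the driver below, which reads the same three values
#from sys.argv:
#
#   parser = site_parser()
#   parser.process_site("http://wiki.example.com/doku.php", 100, ".*(backlink|feed.php)")
#   parser.export_hyperlinks()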
# Try to process the page.
myparser = site_parser()
#myparser.process_site('http://sql2008rs:81/dokuwiki/doku.php', 100, '.*(backlink|feed.php)')
myparser.process_site(sys.argv[1], int(sys.argv[2]), sys.argv[3])
# Export the hyperlinks.
myparser.export_hyperlinks()
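#Expected invocation, assuming the script is saved as site_map.py (the file
#name and URL are examples): the second argument is the page limit, the third
#a regular expression of URLs to skip:
#
#   python site_map.py http://wiki.example.com/doku.php 100 ".*(backlink|feed.php)"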