import urlparse, urllib, re, sys
import sgmllib
import xml.sax.saxutils

class place:

  def create(self, i_url, i_root, i_re_neg = "", i_title = ""):
    #i_re_neg is a regular expression that must NOT match for success (return 1)
    #checks the regexp condition and that i_url sits on the same server as i_root
    #returns 0 in the negative case
    self.relative_url = i_url
    self.title = i_title
    self.full_url = self.merge_url(i_url, i_root)
    if self.full_url is None:
      return 0

    if len(i_re_neg) > 0:
      #apply the regexp condition
      r = re.compile(i_re_neg)
      if r.match(self.full_url) is not None:
        #i_re_neg matched, reject this url
        #print "found", i_re_neg, "in", self.full_url
        return 0

    #print "full_url:", self.full_url
    self.netloc = urlparse.urlparse(self.full_url).netloc
    return 1

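  #A rough sketch of what create returns (hypothetical urls, not from a real run):
  #  p = place()
  #  p.create("a.html", "http://example.com/", ".*logout.*")     #-> 1, url kept
  #  p.create("logout.php", "http://example.com/", ".*logout.*") #-> 0, regexp matched
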
  def merge_url(self, i_url, i_root):
    #handles local files and named anchor (hash) navigation differently
    _url_netloc  = ""
    _root_netloc = ""
    _url_scheme  = ""
    _root_scheme = ""

    if len(i_root) == 0:
      return i_url
    else:
      _root_netloc = urlparse.urlparse(i_root).netloc
      _root_scheme = urlparse.urlparse(i_root).scheme

    if len(i_url) == 0:
      return i_root
    else:
      _url_netloc = urlparse.urlparse(i_url).netloc
      _url_scheme = urlparse.urlparse(i_url).scheme

    #both parts defined
    if (_url_netloc != _root_netloc) and (len(_url_netloc) > 0):
      #link to a different server
      return None

    if len(_url_netloc) == 0:
      #relative path
      _url_netloc = _root_netloc
      _url_scheme = _root_scheme

    #standardise
    if i_root[-1] == "/":
      i_root = i_root[:-1]

    if i_url[0] == "/":
      i_url = i_url[1:]

    #both parts defined
    #the _url_scheme is part of the i_url
    if i_url.startswith(_url_scheme + "://" + _url_netloc) or i_url.startswith(_url_netloc):
      return i_url
    else:
      return _url_scheme + "://" + _url_netloc + "/" + i_url
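
  #A minimal sketch of merge_url resolution (hypothetical urls). Note that a
  #relative path is resolved against the server root, not the page directory:
  #  p = place()
  #  p.merge_url("doku.php?id=start", "http://example.com/wiki/")
  #    -> "http://example.com/doku.php?id=start"
  #  p.merge_url("http://other.com/x", "http://example.com/")
  #    -> None (link to a different server)
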
class link:
  def __init__(self):
    "Initialise an object"
    self.place_from = place()
    self.place_to   = place()
    self.caption = ""

class network:

  def __init__(self):
    "Initialise an object"
    self.places = []
    self.links  = []

  def add_place(self, i_place):
    ret_place = self.find_place(i_place)
    if ret_place is None:
      self.places.append(i_place)
      return i_place
    else:
      return ret_place

  def find_place(self, i_place):
    for i in self.places:
      if i.full_url == i_place.full_url:
        return i
    return None

  def add_link(self, i_place_from, i_place_to, i_caption, i_root_url = ""):
    lnk = link()
    if i_place_from.netloc != i_place_to.netloc:
      #ignore remote links to different servers
      return None

    if i_place_from.full_url == i_place_to.full_url:
      #ignore self links
      return None

    if i_place_to.full_url == i_root_url:
      #ignore back links to the root
      return None

    if i_caption == "":
      i_caption = self.escape_html(i_place_to.full_url)

    #the from and to places are on the same server
    lnk.place_from = self.add_place(i_place_from)
    lnk.place_to   = self.add_place(i_place_to)
    lnk.caption    = i_caption
    self.links.append(lnk)
    return lnk

  def has_page_from_role(self, i_page):
    "Return 1 if the full url occurs as place_from in any link"
    for lnk in self.links:
      if lnk.place_from.full_url == i_page.full_url:
        return 1
    return 0

  def escape_html(self, i_link):
    return xml.sax.saxutils.escape(i_link)

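#A rough usage sketch for network (hypothetical urls): places are deduplicated
#by full_url; links between servers, self links and back links to the root are
#dropped by add_link.
#  net = network()
#  a = place(); a.create("", "http://example.com/index.html")
#  b = place(); b.create("page.html", "http://example.com/index.html")
#  net.add_link(a, b, "Page")  #kept: same server, not a self or root link
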
class site_parser(sgmllib.SGMLParser):
    "Parse the site for A HREF tags and titles"

    def process_site(self, i_url, i_counter = 100, i_re_neg = ""):
      "Fetch web content; i_re_neg is a filter regexp for urls that should be skipped"

      #print 'beg process_site'
      self.loop_counter = self.loop_counter + 1

      if self.root_netloc == "":
        p = urlparse.urlparse(i_url)
        self.root_netloc = p.netloc
        self.root_url = i_url

      self.url = i_url
      self.negative_regexp = i_re_neg

      try:
        #opening can fail, e.g. for urls with a named anchor (fragment) part
        f = urllib.urlopen(i_url)
        s = f.read()
        self.parse(s)
        #print "succ:", self.loop_counter, i_url
      except IOError:
        #print 'err', i_url, sys.exc_info()[1]
        pass

      #for each page_to not yet seen as page_from do process_site
      next_url = self.get_next_url()
      #print 'after next url'
      if (next_url is None) or (self.loop_counter > i_counter):
        #print 'exit too many'
        return
      else:
        #print 'beg recursion'
        #note: one stack frame per page, so a very large i_counter can hit
        #Python's default recursion limit
        self.process_site(next_url, i_counter, i_re_neg)

      #print 'end process_site'

    def parse(self, s):
      "Parse the given html as a string."
      self.feed(s)
      self.close()

    def __init__(self, verbose=0):
      "Initialise an object, passing 'verbose' to the superclass."
      sgmllib.SGMLParser.__init__(self, verbose)
      self.network = network()
      self.page_counter = 0
      self.root_netloc = ""
      self.exp_branches = []
      self.str_indent = ""
      self.tag_value = []
      self.curr_ahref = ""
      self.curr_title = ""
      self.curr_ahref_title = ""
      self.interesting_tag = 0
      self.loop_counter = 0
      self.negative_regexp = ""
      self.root_url = ""

    def start_a(self, attributes):
      "Process a hyperlink and its 'attributes'. Only A tags with an HREF are processed"
      self.curr_ahref   = ""
      self.curr_ahref_title = ""
      self.tag_value = []
      self.interesting_tag = 0
      for name, value in attributes:
        #print name, value
        if name == "href":
          self.curr_ahref = value
          if value.startswith("#"):
            #ignore in-page (self) links
            self.interesting_tag = 0
          else:
            self.interesting_tag = 1
        if name == "title":
          #anchors wrapping an IMG carry their text in the title attribute
          self.curr_ahref_title = value

    def end_a(self):
      #has no arguments
      if self.interesting_tag == 0:
        #some A tags carry only a NAME attribute
        return

      if len(self.curr_ahref) > 0:
        #standardise named anchor (hash) locations
        if self.curr_ahref[0] == "#":
          self.curr_ahref = self.url + self.curr_ahref

      title = "".join(self.tag_value).strip()
      if title == "":
        title = self.curr_ahref_title

      #create the objects
      p_from = place()
      res_from = p_from.create("", self.url, self.negative_regexp, self.curr_title)
      p_to     = place()
      res_to   = p_to.create(self.curr_ahref, self.url, self.negative_regexp)
      #print "res_from, res_to", res_from, res_to
      self.interesting_tag = 0

      if res_from == 1 and res_to == 1:
        #store the link only if both the from and to places are valid
        self.network.add_link(p_from, p_to, title, self.root_url)

    def start_title(self, attributes):
      #the title text is not available yet, handle_data collects it
      self.interesting_tag = 1
      self.tag_value = []


    def end_title(self):
      #has no arguments
      self.interesting_tag = 0
      self.curr_title = "".join(self.tag_value).strip()


    def handle_data(self, data):
      #collect the text between an interesting start tag and its end tag
      if self.interesting_tag == 1:
        self.tag_value.append(data)


    def print_hyperlinks(self):
      "Print the list of hyperlinks."
      for lnk in self.network.links:
        #print lnk.place_from.full_url, "->", lnk.place_to.full_url, " aka ", lnk.caption
        print "<link from='" + lnk.place_from.full_url + "' to='" + lnk.place_to.full_url + "' caption='" + lnk.caption + "'>"


    def export_item(self, i_place, i_caption, i_indent, i_parent_url = "", i_root_url = ""):
      #is it a branch?
      #was it exported already?
      #i_indent - depth counter, starting at 1

      if (i_place.full_url == i_parent_url) or (i_place.full_url == i_root_url):
        #ignore links back to the parent and the root page
        return

      if i_root_url == "":
        #the root is being exported for the first time
        i_root_url = self.root_url

      is_branch = self.network.has_page_from_role(i_place)

      was_exported = 0
      self.str_indent = "  " * i_indent
      for exp in self.exp_branches:
        #look for the url among the already exported branches
        if exp.full_url == i_place.full_url:
          was_exported = 1
          break

      if was_exported == 1:
        self.printx("<branchLink id=\"" + self.network.escape_html(i_place.full_url) + "\" link=\"" + self.network.escape_html(i_place.full_url) + "\">")
        self.printx("  <branchText>" + i_caption + "</branchText>")
        self.printx("</branchLink>")
        return

      #a new item
      if is_branch == 1:
        if i_indent == 1:
          #the first branch takes its title from the page itself, not from a link
          i_caption = i_place.title

        self.exp_branches.append(i_place)
        self.printx("<branch id=\"" + self.network.escape_html(i_place.full_url) + "\" link=\"" + self.network.escape_html(i_place.full_url) + "\">")
        self.printx("  <branchText>" + i_caption + "</branchText>")
        freeze_indent = self.str_indent

        for lnk in self.network.links:
          if lnk.place_from.full_url == i_place.full_url:
            self.export_item(lnk.place_to, lnk.caption, i_indent + 1, i_place.full_url, i_root_url)

        self.str_indent = freeze_indent
        self.printx("</branch>")
      else:
        self.printx("<leaf link=\"" + self.network.escape_html(i_place.full_url) + "\">")
        self.printx("  <leafText>" + i_caption + "</leafText>")
        self.printx("</leaf>")


    def export_hyperlinks(self):
      "Create the xml export."
      self.exp_branches = []
      self.printx("""<?xml version='1.0' encoding='windows-1250'?>
<?xml-stylesheet type='text/xsl' href='xmlTree.xsl'?>
<!DOCTYPE tree SYSTEM 'tree.dtd'>
<tree>""")
      self.export_item(self.network.links[0].place_from, self.network.links[0].caption, 1)
      self.str_indent = ''
      self.printx("</tree>")

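    #Illustrative shape of the exported xml (hypothetical urls, abridged):
    #  <tree>
    #    <branch id="http://example.com/" link="http://example.com/">
    #      <branchText>Home page</branchText>
    #      <leaf link="http://example.com/a.html">
    #        <leafText>A</leafText>
    #      </leaf>
    #    </branch>
    #  </tree>
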
    def get_next_url(self):
      "Find the next url that occurs as place_to but not yet as place_from"
      #self.network.places grows while links are added, so this scan acts as
      #a crawl frontier: pages are visited once each, in discovery order
      start_cnt = self.page_counter
      for pg in self.network.places[start_cnt:]:
        self.page_counter = self.page_counter + 1
        #the rules for adding places are in the add_link method
        if not self.network.has_page_from_role(pg):
          return pg.full_url

      return None

    def printx(self, i_str):
      #print the string prefixed with the current indentation
      print self.str_indent + i_str

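#Expected invocation (Python 2; the script name depends on how this file is saved):
#  python site_tree.py <start_url> <max_pages> <negative_regexp>
#  e.g. python site_tree.py http://example.com/ 50 '.*(backlink|feed\.php)'
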
# Try and process the page.
myparser = site_parser()
#myparser.process_site('http://sql2008rs:81/dokuwiki/doku.php', 100, '.*(backlink|feed.php)')
#sys.argv[2] must be converted to int, otherwise the page limit never triggers
#(in Python 2 an int always compares smaller than a string)
myparser.process_site(sys.argv[1], int(sys.argv[2]), sys.argv[3])

# Get the hyperlinks.
myparser.export_hyperlinks()