"""Crawl one web site and print its internal link structure as an XML tree.

Usage::

    script.py ROOT_URL [MAX_PAGES [NEGATIVE_REGEXP]]

Example::

    script.py http://sql2008rs:81/dokuwiki/doku.php 100 '.*(backlink|feed.php)'

The module targets Python 2 (``sgmllib``, ``urlparse``).  When those modules
are missing, the imports fall back to ``html.parser`` / ``urllib.parse`` /
``urllib.request`` so the module can also be imported on Python 3.
"""
import re
import string  # not used below; kept in case other code relies on it
import sys
import urllib
import xml.sax.saxutils

try:  # Python 2
    import sgmllib
    import urlparse

    _ParserBase = sgmllib.SGMLParser
    _urlopen = urllib.urlopen
except ImportError:  # Python 3 fallback
    import urllib.parse as urlparse
    import urllib.request
    from html.parser import HTMLParser

    _urlopen = urllib.request.urlopen

    class _ParserBase(HTMLParser):
        """Small stand-in for ``sgmllib.SGMLParser``.

        Only provides what ``site_parser`` needs: dispatching tags to
        ``start_<tag>(attributes)`` / ``end_<tag>()`` methods when present.
        """

        def __init__(self, verbose=0):
            # 'verbose' is accepted for signature compatibility and ignored.
            HTMLParser.__init__(self)

        def handle_starttag(self, tag, attrs):
            handler = getattr(self, "start_" + tag, None)
            if handler is not None:
                handler(attrs)

        def handle_endtag(self, tag):
            handler = getattr(self, "end_" + tag, None)
            if handler is not None:
                handler()


_XML_HEADER = """<?xml version='1.0' encoding='windows-1250'?>
<?xml-stylesheet type='text/xsl' href='xmlTree.xsl'?>
<!DOCTYPE tree SYSTEM 'tree.dtd'>
<tree>"""


def _escape_attr(i_value):
    "Escape a value for use inside a double-quoted XML attribute."
    return xml.sax.saxutils.escape(i_value, {'"': "&quot;"})


class place(object):
    "One page (URL) of the crawled site."

    def __init__(self):
        "Initialise an empty place; call create() to fill it in."
        self.relative_url = ""
        self.title = ""
        self.full_url = None
        self.netloc = ""

    def create(self, i_url, i_root, i_re_neg="", i_title=""):
        """Fill the place from a link found on page i_root.

        i_url    -- link target as written in the page ("" means i_root itself)
        i_root   -- URL of the page the link was found on
        i_re_neg -- regular expression that must NOT match (re.match, i.e.
                    anchored at the start) the resulting full URL
        i_title  -- page title to remember for this place

        Returns 1 on success, 0 when the link is rejected by merge_url()
        or by i_re_neg.
        """
        self.relative_url = i_url
        self.title = i_title
        self.full_url = self.merge_url(i_url, i_root)
        if self.full_url is None:
            return 0

        if i_re_neg and re.match(i_re_neg, self.full_url) is not None:
            # the negative condition is found
            return 0

        self.netloc = urlparse.urlparse(self.full_url).netloc
        return 1

    def merge_url(self, i_url, i_root):
        """Combine link i_url found on page i_root into one URL.

        Returns None when the link cannot belong to the crawled site: it
        names a different server, or (for a root that has a server part) it
        uses a different scheme without a server, e.g. mailto: or
        javascript:.
        """
        if len(i_root) == 0:
            return i_url
        if len(i_url) == 0:
            return i_root

        root_parts = urlparse.urlparse(i_root)
        url_parts = urlparse.urlparse(i_url)

        if url_parts.netloc:
            if url_parts.netloc != root_parts.netloc:
                # link to a different server
                return None
            if url_parts.scheme:
                # already a complete URL on the same server
                return i_url
            # scheme-relative link ("//host/path"): take the scheme from root
            return urlparse.urljoin(i_root, i_url)

        if not root_parts.netloc:
            # local files: the link is kept as a path without leading slash
            if i_url[0] == "/":
                return i_url[1:]
            return i_url

        if url_parts.scheme and url_parts.scheme != root_parts.scheme:
            return None

        # Relative link: resolve against the page it was found on, so that
        # "page.html", "../x" and "?id=x" keep the page's directory/path.
        return urlparse.urljoin(i_root, i_url)


class link(object):
    "Directed link between two places, with a caption."

    def __init__(self):
        "Initialise an object"
        self.place_from = place()
        self.place_to = place()
        self.caption = ""


class network(object):
    "All places found so far and the links between them."

    def __init__(self):
        "Initialise an object"
        self.places = []
        self.links = []

    def add_place(self, i_place):
        "Register i_place unless its full_url is known; return the stored place."
        known = self.find_place(i_place)
        if known is not None:
            return known
        self.places.append(i_place)
        return i_place

    def find_place(self, i_place):
        "Return the stored place with the same full_url, or None."
        for candidate in self.places:
            if candidate.full_url == i_place.full_url:
                return candidate
        return None

    def add_link(self, i_place_from, i_place_to, i_caption, i_root_url=""):
        """Store a link and both of its places.

        Returns the new link, or None when the link is ignored: it leads to
        a different server, to the same page, or back to i_root_url.
        An empty i_caption is replaced by the target URL.  Captions are
        stored unescaped; they are escaped when written out.
        """
        if i_place_from.netloc != i_place_to.netloc:
            # ignore remote links to different servers
            return None
        if i_place_from.full_url == i_place_to.full_url:
            # ignore self link
            return None
        if i_place_to.full_url == i_root_url:
            # ignore back link to root
            return None

        if i_caption == "":
            i_caption = i_place_to.full_url

        lnk = link()
        lnk.place_from = self.add_place(i_place_from)
        lnk.place_to = self.add_place(i_place_to)
        lnk.caption = i_caption
        self.links.append(lnk)
        return lnk

    def has_page_from_role(self, i_page):
        "If full url is found in links like from_place return 1"
        for lnk in self.links:
            if lnk.place_from.full_url == i_page.full_url:
                return 1
        return 0

    def escape_html(self, i_link):
        "Escape &, < and > for use in XML text."
        return xml.sax.saxutils.escape(i_link)


class site_parser(_ParserBase):
    "Parsing the site for a href and titles"

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."
        _ParserBase.__init__(self, verbose)
        self.network = network()
        self.page_counter = 0
        self.root_netloc = ""
        self.exp_branch = []      # original attribute name, kept as is
        self.exp_branches = []    # branches already written by export_item
        self.str_indent = ""
        self.tag_value = []
        self.curr_ahref = ""
        self.curr_title = ""
        self.curr_ahref_title = ""
        self.interesting_tag = 0
        self.loop_counter = 0
        self.negative_regexp = ""
        self.root_url = ""
        self.url = ""

    def process_site(self, i_url, i_counter=100, i_re_neg=""):
        """Fetch i_url and keep following internal links.

        i_counter -- page limit; int or numeric string (ValueError otherwise).
                     Crawling stops once more than i_counter pages were
                     processed.
        i_re_neg  -- filter condition; URLs matching it are not followed.

        Pages that cannot be fetched are skipped silently.
        """
        # The limit may arrive as a string from the command line.
        max_pages = int(i_counter)
        self.negative_regexp = i_re_neg

        next_url = i_url
        while next_url is not None:
            self.loop_counter = self.loop_counter + 1

            if self.root_url == "":
                # the first processed page is the root of the site
                self.root_netloc = urlparse.urlparse(next_url).netloc
                self.root_url = next_url

            self.url = next_url

            try:
                content = self._fetch(next_url)
            except (IOError, ValueError):
                # unreachable page or unusable URL: nothing to parse
                content = None
            if content is not None:
                self.parse(content)

            # next page that is a link target but was not yet a link source
            next_url = self.get_next_url()
            if self.loop_counter > max_pages:
                break

    def _fetch(self, i_url):
        "Return the content of i_url as text; the handle is always closed."
        handle = _urlopen(i_url)
        try:
            content = handle.read()
        finally:
            handle.close()

        if not isinstance(content, str):
            # Python 3 returns bytes.  Falling back to utf-8 when the
            # response names no usable charset is an assumption.
            charset = handle.headers.get_content_charset() or "utf-8"
            try:
                content = content.decode(charset, "replace")
            except LookupError:
                content = content.decode("utf-8", "replace")
        return content

    def parse(self, s):
        "Parse the given html as string."
        self.feed(s)
        self.close()

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'. We process only a with HREF"
        self.curr_ahref = ""
        self.curr_ahref_title = ""
        self.tag_value = []
        self.interesting_tag = 0
        for name, value in attributes:
            if name == "href":
                self.curr_ahref = value or ""
                # ignore empty links and links into the same page ("#...")
                if self.curr_ahref == "" or self.curr_ahref[0] == "#":
                    self.interesting_tag = 0
                else:
                    self.interesting_tag = 1
            if name == "title":
                # A to IMG have data in title attribute
                self.curr_ahref_title = value or ""

    def end_a(self):
        "Close a hyperlink: store it in the network when both ends are valid."
        if self.interesting_tag == 0 or self.curr_ahref == "":
            # some A tags wear only NAME attribute
            return

        title = "".join(self.tag_value).strip()
        if title == "":
            title = self.curr_ahref_title

        p_from = place()
        res_from = p_from.create("", self.url, self.negative_regexp, self.curr_title)
        p_to = place()
        res_to = p_to.create(self.curr_ahref, self.url, self.negative_regexp)
        self.interesting_tag = 0
        self.curr_ahref = ""

        if res_from == 1 and res_to == 1:
            # put it into the list only if from and to are correct
            self.network.add_link(p_from, p_to, title, self.root_url)

    def start_title(self, attributes):
        "Start collecting the page title; its text arrives via handle_data."
        self.interesting_tag = 1
        self.tag_value = []

    def end_title(self):
        "Store the collected text as the title of the current page."
        self.interesting_tag = 0
        self.curr_title = "".join(self.tag_value).strip()

    def handle_data(self, data):
        "Collect text between start tag / end tag of A and TITLE."
        if self.interesting_tag == 1:
            self.tag_value.append(data)

    def print_hyperlinks(self):
        "Print every collected link as one <link> line."
        quote = {"'": "&apos;"}
        for lnk in self.network.links:
            print("<link from='" + xml.sax.saxutils.escape(lnk.place_from.full_url, quote)
                  + "' to='" + xml.sax.saxutils.escape(lnk.place_to.full_url, quote)
                  + "' caption='" + xml.sax.saxutils.escape(lnk.caption, quote) + "'>")

    def export_item(self, i_place, i_caption, i_indent, i_parent_url="", i_root_url=""):
        """Print i_place as <branch>, <branchLink> or <leaf>.

        i_place      -- place to export
        i_caption    -- text of the link that led here (escaped on output)
        i_indent     -- depth, counted from 1
        i_parent_url -- URL of the exporting parent; links back to it are skipped
        i_root_url   -- URL of the root page; links back to it are skipped

        A place that is the source of some link is a branch and its targets
        are exported recursively.  A branch that was already exported is
        written as <branchLink>.
        """
        if i_place.full_url == i_parent_url or i_place.full_url == i_root_url:
            # ignore links to parent and root page
            return

        if i_root_url == "":
            # root must be exported for the first time
            i_root_url = self.root_url

        self.str_indent = " " * i_indent
        url_attr = _escape_attr(i_place.full_url)

        was_exported = any(done.full_url == i_place.full_url
                           for done in self.exp_branches)
        if was_exported:
            self.printx("<branchLink id=\"" + url_attr + "\" link=\"" + url_attr + "\">")
            self.printx(" <branchText>" + self.network.escape_html(i_caption) + "</branchText>")
            self.printx("</branchLink>")
            return

        if self.network.has_page_from_role(i_place) == 1:
            if i_indent == 1:
                # first branch has special title not from link
                i_caption = i_place.title

            self.exp_branches.append(i_place)
            self.printx("<branch id=\"" + url_attr + "\" link=\"" + url_attr + "\">")
            self.printx(" <branchText>" + self.network.escape_html(i_caption) + "</branchText>")
            freeze_indent = self.str_indent

            for lnk in self.network.links:
                if lnk.place_from.full_url == i_place.full_url:
                    self.export_item(lnk.place_to, lnk.caption, i_indent + 1,
                                     i_place.full_url, i_root_url)

            # recursion changed the indent; restore it for the closing tag
            self.str_indent = freeze_indent
            self.printx("</branch>")
        else:
            self.printx("<leaf link=\"" + url_attr + "\">")
            self.printx(" <leafText>" + self.network.escape_html(i_caption) + "</leafText>")
            self.printx("</leaf>")

    def export_hyperlinks(self):
        "Create xml export."
        self.exp_branches = []
        self.str_indent = ''
        self.printx(_XML_HEADER)
        if self.network.links:
            # the source of the first stored link is exported as the top item
            first = self.network.links[0]
            self.export_item(first.place_from, first.caption, 1)
        self.str_indent = ''
        self.printx("</tree>")

    def get_next_url(self):
        "Find next url from place_to that is not in place_from"
        for pg in self.network.places[self.page_counter:]:
            self.page_counter = self.page_counter + 1
            # rules for adding places is in add_link method
            if not self.network.has_page_from_role(pg):
                return pg.full_url
        return None

    def printx(self, i_str):
        "Print i_str prefixed by the current indent."
        print(self.str_indent + i_str)


def main(argv):
    """Run the crawler from command-line arguments.

    argv[1] is the root URL; argv[2] (page limit, default 100) and argv[3]
    (negative regular expression, default "") are optional.
    Returns the process exit code.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s ROOT_URL [MAX_PAGES [NEGATIVE_REGEXP]]\n" % argv[0])
        return 2

    counter = argv[2] if len(argv) > 2 else 100
    re_neg = argv[3] if len(argv) > 3 else ""

    # Try and process the page.
    myparser = site_parser()
    myparser.process_site(argv[1], counter, re_neg)

    # Get the hyperlinks.
    myparser.export_hyperlinks()
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))