1<?php
2if(!defined('DOKU_INC')) die();
3if(!defined('DOKU_PLUGIN')) define('DOKU_PLUGIN',DOKU_INC.'lib/plugins/');
4require_once(DOKU_PLUGIN.'action.php');
5
/**
 * DokuWiki action plugin "robot404".
 *
 * If the client is recognised as a search-engine robot, requests for hidden
 * pages or disabled actions are answered with a bare "404 Not Found".
 * For every other client the same requests are marked "noindex,nofollow"
 * (robots meta tag plus X-Robots-Tag header), as a fallback in case the
 * robot detection missed a crawler.
 */
class action_plugin_robot404 extends DokuWiki_Action_Plugin {

  /**
   * Return the plugin info loaded from the info.txt file next to this script.
   */
  function getInfo(){ return conf_loadfile(dirname(__FILE__).'/info.txt'); }

  /**
   * Register the event hooks.
   *
   * Robots get the 404 handler; everybody else gets the header handlers.
   *
   * @param object $contr event controller offering register_hook()
   */
  function register($contr){
    if(static::client_isrobot()){
      $contr->register_hook('ACTION_ACT_PREPROCESS','BEFORE',$this,'handle_action');
      return;
    }
    # In case our robot detection does not capture a robot, also hook into the
    # headers so we can add noindex,nofollow to these pages.
    $contr->register_hook('TPL_METAHEADER_OUTPUT','BEFORE',$this,'handle_header');
    $contr->register_hook('ACTION_HEADERS_SEND','BEFORE',$this,'handle_header');
  }

  /**
   * Add noindex,nofollow to the response for hidden pages / disabled actions.
   *
   * Handles both TPL_METAHEADER_OUTPUT (robots meta tag) and
   * ACTION_HEADERS_SEND (X-Robots-Tag HTTP header).
   *
   * @param object $e the event; $e->name selects the branch, $e->data is modified
   */
  function handle_header(&$e){
    global $ACT;
    # Looking at $ACT is not enough (e.g., if the 'register' action is disabled,
    # ACT becomes 'show'), so the original 'do' parameter is checked as well.
    # The parameter is optional: a missing 'do' must not count as an action.
    $requested=isset($_REQUEST['do'])?$_REQUEST['do']:null;
    if(!$this->ishiddenpage()&&!$this->isdisabledaction($ACT)&&!$this->isdisabledaction($requested)) return;

    if($e->name=='TPL_METAHEADER_OUTPUT'){
      $metas=(isset($e->data['meta'])&&is_array($e->data['meta']))?$e->data['meta']:[];
      $found=false;
      foreach($metas as $key=>$entry){
        # Not every meta entry carries a 'name' (e.g. http-equiv / property entries).
        if(!is_array($entry)||!isset($entry['name'])||$entry['name']!=='robots') continue;
        $entry['content']=$this->restrict_robots_content(isset($entry['content'])?$entry['content']:'');
        $e->data['meta'][$key]=$entry;
        $found=true;
      }
      if(!$found) $e->data['meta'][]=['name'=>'robots','content'=>'noindex,nofollow'];
    }
    elseif($e->name=='ACTION_HEADERS_SEND'){
      $e->data[]='X-Robots-Tag: noindex,nofollow';
    }
  }

  /**
   * Rewrite a robots meta content string so that it forbids indexing.
   *
   * Directives are split on commas and trimmed; any existing
   * index/follow/noindex/nofollow directive (case-insensitive) and empty
   * items are dropped, all other directives are kept, and
   * 'noindex','nofollow' are appended once.
   *
   * @param string $content e.g. 'index, follow'
   * @return string e.g. 'noindex,nofollow'
   */
  protected function restrict_robots_content($content){
    $kept=[];
    foreach(explode(',',(string)$content) as $directive){
      $directive=trim($directive);
      if($directive==='') continue;
      if(in_array(strtolower($directive),['index','follow','noindex','nofollow'],true)) continue;
      $kept[]=$directive;
    }
    $kept[]='noindex';
    $kept[]='nofollow';
    return implode(',',$kept);
  }

  /**
   * Detect whether the current client is a robot.
   *
   * @return bool|string true when the 'isrobot404' request parameter is set
   *                     to a truthy value, the lower-cased matched user-agent
   *                     token when the user agent matches one of the patterns,
   *                     false otherwise
   */
  static function client_isrobot(){
    if(!empty($_REQUEST['isrobot404'])) return true;
    if(empty($_SERVER['HTTP_USER_AGENT'])) return false;
    $agent=$_SERVER['HTTP_USER_AGENT'];

    # Known crawler names.
    if(preg_match('#(Google|msnbot|Yahoo|Rambler|AbachoBOT|accoona|AcioRobot|ASPSeek|CocoCrawler|Dumbot|FAST-WebCrawler|GeonaBot|Gigabot|Lycos|MSRBOT|Scooter|AltaVista|IDBot|eStyle|Scrubby|BaiDuSpider|Baiduspider)#i',$agent,$m)) return strtolower($m[1]);
    # Generic keywords, see the list in:
    # https://stackoverflow.com/questions/677419/how-to-detect-search-engine-bots-with-php
    if(preg_match('#(bot|crawl|slurp|spider|mediapartners)#i',$agent,$m)) return strtolower($m[1]);
    return false;
  }

  /**
   * Whether the current page ($ID) is a hidden page and the plugin's
   * 'hiddenpages' setting is enabled.
   */
  function ishiddenpage(){
    global $ID;
    return $this->getConf('hiddenpages') && isHiddenPage($ID);
  }

  /**
   * Whether an action is listed in the global $conf['disableactions'] or in
   * this plugin's own 'disableactions' setting.
   *
   * @param string|array|null $action action name, or an array of actions
   * @return bool true if (any of) the given action(s) is disabled
   */
  function isdisabledaction($action){
    global $conf;

    if(is_array($action)){
      foreach($action as $key=>$act){
        # NOTE(review): presumably DokuWiki carries the action name in the array
        # key for requests like do[save]=1, so string keys are checked as well
        # as values - confirm against act_clean().
        if(is_string($key)&&$this->isdisabledaction($key)) return true;
        if($this->isdisabledaction($act)) return true;
      }
      return false;
    }

    # A missing/empty action can never be disabled. Without this guard an empty
    # setting (explode() yields ['']) loosely matched null and '' and every
    # page was flagged.
    if(!is_string($action)||$action==='') return false;

    $lists=[
      isset($conf['disableactions'])?$conf['disableactions']:'',
      $this->getConf('disableactions'),
    ];
    foreach($lists as $list){
      if(is_string($list)) $list=explode(',',$list);
      if(!is_array($list)) continue; # unset or unexpected setting: nothing to match
      # Trim so that 'register, resendpwd' style settings match; compare strictly.
      if(in_array($action,array_map('trim',$list),true)) return true;
    }
    return false;
  }

  /**
   * Answer robots with a 404 for hidden pages and disabled actions.
   *
   * Sends the status line, prints a short reason and terminates the request.
   *
   * @param object $e the ACTION_ACT_PREPROCESS event; $e->data holds the action
   */
  function handle_action(&$e){
    $hidden=$this->ishiddenpage();
    if(!$hidden && !$this->isdisabledaction($e->data)) return; # neither hidden nor disabled: nothing to do.

    header('HTTP/1.0 404 Not Found');
    if($hidden){
      echo "Hidden page for robots.";
    }
    else{
      # The action may be a string or an array; flatten it to a printable list.
      $names=[];
      foreach((array)$e->data as $key=>$value){
        $names[]=is_string($key)?$key:(is_scalar($value)?(string)$value:'');
      }
      # The action comes from the request, so escape it before echoing.
      echo "Disallowed action for robots: ".htmlspecialchars(implode(',',$names),ENT_QUOTES,'UTF-8');
    }
    die();
  }
}
81