<?php

/*
 * Copyright (c) 2022 Sergei Chekanov
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */


/**
 * Builds a short, one-line description of an article.
 *
 * The description is taken, in order of preference, from an
 * {{abstract|...}} template, a {{short description|...}} template, or the
 * first sufficiently long sentence of the plain-text version of the article.
 *
 * Words are counted with str_word_count(); per the PHP manual this is
 * locale-dependent and byte-oriented, so non-Latin text may be under-counted.
 */
class ShortDescription
{
    /** Maximum number of qualifying text lines scanned for the first sentence. */
    const MAX_LINES = 30;

    /** Minimum number of words a text line needs in order to be scanned. */
    const MIN_LINE_WORDS = 5;

    /** Minimum number of words an accepted description or sentence must have. */
    const MIN_WORDS = 4;

    /** Original MediaWiki record. */
    private $record_wiki = '';

    /** Plain-text version of the same record. */
    private $record_txt = '';

    /**
     * Constructor.
     *
     * @param string $record_wiki Wiki text of the article
     * @param string $record_txt  plain text of the article
     */
    public function __construct($record_wiki, $record_txt)
    {
        // Cast so that a null record is treated as empty text instead of
        // being passed on to the string functions below.
        $this->record_wiki = (string) $record_wiki;
        $this->record_txt = (string) $record_txt;
    }

    /**
     * Get the first sentence of a plain text.
     *
     * Only lines with at least MIN_LINE_WORDS words are considered, and at
     * most MAX_LINES of them. The selected lines are joined with single
     * spaces and split into sentences; the first sentence with at least
     * MIN_WORDS words is returned.
     *
     * @param string $string plain text
     * @return string first long sentence, or an empty string if none is found
     */
    protected function get_first_sentence($string)
    {
        $lines = preg_split("/\r\n|\n|\r/", (string) $string);
        if ($lines === false) {
            return '';
        }

        // Keep only the "long" lines; short ones are presumably headings,
        // table residue or other markup leftovers.
        $chunks = [];
        foreach ($lines as $line) {
            if (str_word_count($line, 0) >= self::MIN_LINE_WORDS) {
                $chunks[] = trim($line);
                if (count($chunks) >= self::MAX_LINES) {
                    break;
                }
            }
        }

        // A sentence ends with ".", "?" or "!" followed by whitespace and a letter.
        $sentences = preg_split('/(?<=[.?!])\s+(?=[a-z])/i', implode(' ', $chunks));
        if ($sentences === false) {
            return '';
        }

        foreach ($sentences as $sentence) {
            if (str_word_count($sentence, 0) >= self::MIN_WORDS) {
                return trim($sentence);
            }
        }

        return '';
    }

    /**
     * Trim surrounding whitespace and remove trailing full stops.
     *
     * @param string $string text to clean up
     * @return string cleaned text
     */
    protected function mytrim($string)
    {
        $string = trim($string);
        $string = rtrim($string, '.');

        // Removing the dots may expose whitespace that preceded them.
        return trim($string);
    }

    /**
     * Get short description of the article.
     *
     * @return string short description (may be empty if nothing suitable is found)
     */
    public function getDescription()
    {
        $patterns = [
            '/{{abstract\|(.*?)}}/i',          // EnHub style
            '/{{short description\|(.*?)}}/i', // Wikipedia style
        ];

        foreach ($patterns as $pattern) {
            if (
                preg_match($pattern, $this->record_wiki, $match) === 1
                && str_word_count($match[1], 0) >= self::MIN_WORDS
            ) {
                return $this->mytrim($match[1]);
            }
        }

        // If nothing usable is found in templates, use the plain text.
        return $this->mytrim($this->get_first_sentence($this->record_txt));
    }
} // end class


// Debugging example (kept commented out so that including this file has no side effects):
//$wiki="Test {{Abstract|EncycloReader is a web application designed to search multiple online encyclopedias at once}}
//'''EncycloReader''' is a web application designed to search multiple online encyclopedias at once and read articles in a unified representation {{Author|S.V.Chekanov}}";
//$txt="{shs = ss}\n || \n Test. This is a web application designed to search multiple online encyclopedias at once.
//EncycloReader is a web application designed to search multiple online encyclopedias at once and read articles in a unified representation";
//$DESC = new ShortDescription($wiki, $txt);
//print($DESC->getDescription());
// Note: get_first_sentence() is a protected method, so it cannot be called directly from here.