<?php

/*
 * Copyright (c) 2022 Sergei Chekanov
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/**
 * Builds a short, one-sentence description of an article.
 *
 * The description is taken from an {{abstract|...}} or
 * {{short description|...}} template in the wiki markup when one with
 * enough words is present; otherwise it is the first sufficiently long
 * sentence of the plain-text version of the article.
 */
class ShortDescription
{
    /** Maximum number of qualifying text lines scanned for the first sentence. */
    const MAX_LINES = 30;

    /** A plain-text line is used only when it has at least this many words. */
    const MIN_LINE_WORDS = 5;

    /** A template value or sentence is accepted only with at least this many words. */
    const MIN_DESCRIPTION_WORDS = 4;

    /** Original MediaWiki record. */
    private $record_wiki = '';

    /** Plain-text version of the same record. */
    private $record_txt = '';

    /**
     * Constructor.
     *
     * @param string $record_wiki wiki text
     * @param string $record_txt  plain text
     */
    public function __construct($record_wiki, $record_txt)
    {
        // Cast so that null input behaves like an empty record instead of
        // reaching the preg_* functions as null.
        $this->record_wiki = (string) $record_wiki;
        $this->record_txt = (string) $record_txt;
    }

    /**
     * Get the first sentence of a plain text.
     *
     * Lines with fewer than MIN_LINE_WORDS words are skipped, at most
     * MAX_LINES of the remaining lines are joined with single spaces, and
     * the first sentence with at least MIN_DESCRIPTION_WORDS words is
     * returned. Words are counted with str_word_count().
     *
     * @param string $string plain text
     * @return string first long sentence, or '' when there is none
     */
    protected function get_first_sentence($string)
    {
        $lines = preg_split("/\r\n|\n|\r/", (string) $string);
        if ($lines === false) {
            return '';
        }

        $chunks = [];
        foreach ($lines as $line) {
            if (str_word_count($line) < self::MIN_LINE_WORDS) {
                continue;
            }
            $chunks[] = trim($line);
            // Stop once the cap is reached, so exactly MAX_LINES lines are kept.
            if (count($chunks) >= self::MAX_LINES) {
                break;
            }
        }
        $text = trim(implode(' ', $chunks));

        // A sentence ends at '.', '?' or '!' followed by whitespace and a letter.
        $sentences = preg_split('/(?<=[.?!])\s+(?=[a-z])/i', $text);
        if ($sentences === false) {
            return '';
        }

        foreach ($sentences as $sentence) {
            if (str_word_count($sentence) >= self::MIN_DESCRIPTION_WORDS) {
                return trim($sentence);
            }
        }

        return '';
    }

    /**
     * Trim whitespace and remove trailing full stops.
     *
     * @param string $string text to clean
     * @return string cleaned text
     */
    protected function mytrim($string)
    {
        return trim(rtrim(trim($string), '.'));
    }

    /**
     * Get short description of the article.
     *
     * @return string short description ('' when nothing suitable is found)
     */
    public function getDescription()
    {
        // Tried in order; the first template whose value is long enough wins.
        $patterns = [
            '/\{\{abstract\|(.*?)\}\}/i',          // EnHub style
            '/\{\{short description\|(.*?)\}\}/i', // Wikipedia style
        ];

        foreach ($patterns as $pattern) {
            if (preg_match($pattern, $this->record_wiki, $match) === 1
                && str_word_count($match[1]) >= self::MIN_DESCRIPTION_WORDS
            ) {
                return $this->mytrim($match[1]);
            }
        }

        // If nothing is found in templates, use plain text.
        return $this->mytrim($this->get_first_sentence($this->record_txt));
    }
} // end class

// some debugging
//$wiki="Test {{Abstract|EncycloReader is a web application designed to search multiple online encyclopedias at once}}
//'''EncycloReader''' is a web application designed to search multiple online encyclopedias at once and read articles in a unified representation {{Author|S.V.Chekanov}}";
//$txt="{shs = ss}\n || \n Test. This is a web application designed to search multiple online encyclopedias at once.
//EncycloReader is a web application designed to search multiple online encyclopedias at once and read articles in a unified representation";
//$DESC = new ShortDescription($wiki, $txt);
//print($DESC->getDescription());
//print_r(get_first_sentence($txt));