Diff: STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/rake-php-plus/src/RakePlus.php

Keine Baseline-Datei – Diff nur gegen leer.
Zur Liste
1 -
1 + <?php
2 +
3 + namespace DonatelloZa\RakePlus;
4 +
5 + defined('ABSPATH') or die();
6 + use InvalidArgumentException;
7 +
8 + class RakePlus
9 + {
10 + /** @var string */
11 + protected $language = 'en_US';
12 +
13 + /** @var string */
14 + protected $language_file = "";
15 +
16 + /** @var string|null */
17 + private $pattern = null;
18 +
19 + /** @var array */
20 + private $phrase_scores = [];
21 +
22 + /** @var int */
23 + private $min_length = 0;
24 +
25 + /** @var bool */
26 + private $filter_numerics = true;
27 +
28 + /** @var string */
29 + private $sentence_regex;
30 +
31 + /** @var string */
32 + private $line_terminator;
33 +
34 + /** @var bool */
35 + public $mb_support = false;
36 +
37 + /** @var LangParseOptions */
38 + public $parseOptions;
39 +
40 + const ORDER_ASC = 'asc';
41 +
42 + const ORDER_DESC = 'desc';
43 +
/**
 * Builds a RakePlus instance and, when text is supplied, immediately
 * extracts the key phrases from it.
 *
 * The $stopwords argument is interpreted as follows:
 *
 *  - A string with a .pattern or .php extension is treated as the path
 *    and filename of a stopwords file to load.
 *  - Any other string is treated as a language code (default: en_US);
 *    the language's .pattern file is tried first and its .php file is
 *    used when the .pattern file does not exist.
 *  - An array is used directly as the list of stopwords.
 *  - An instance of a class derived from AbstractStopwordProvider
 *    supplies the stopwords pattern itself.
 *
 * @param string|null                           $text              Text to turn into keywords/phrases.
 * @param AbstractStopwordProvider|string|array $stopwords         Stopwords/language to use.
 * @param int                                   $phrase_min_length Minimum keyword/phrase length.
 * @param bool                                  $filter_numerics   Filter out numeric phrases/keywords.
 * @param null|ILangParseOptions                $parseOptions      Additional text parsing options; when null
 *                                                                 they are created from $stopwords.
 *
 * @throws InvalidArgumentException When $phrase_min_length is negative or when $parseOptions
 *                                  is neither null nor an ILangParseOptions instance.
 */
public function __construct($text = null, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true,
                            $parseOptions = null)
{
    $this->mb_support = extension_loaded('mbstring');

    $this->setMinLength($phrase_min_length);
    $this->setFilterNumerics($filter_numerics);

    // Reject anything that is neither "use the defaults" (null) nor a real options object.
    if ($parseOptions !== null && !($parseOptions instanceof ILangParseOptions)) {
        throw new InvalidArgumentException("The \$parseOptions argument must be an instance of ILangParseOptions.");
    }

    $this->parseOptions = ($parseOptions === null)
        ? LangParseOptions::create($stopwords)
        : $parseOptions;

    // Cache the two expressions used to split the text into sentences.
    $this->sentence_regex = $this->parseOptions->getSentenceRegex();
    $this->line_terminator = $this->parseOptions->getLineTerminator();

    if ($text !== null) {
        $this->extract($text, $stopwords);
    }
}
94 +
/**
 * Static factory: builds a RakePlus instance and extracts the key
 * phrases from the text in one call.
 *
 * All arguments are handed to the constructor unchanged, so $stopwords
 * may be:
 *
 *  - a path to a stopwords file with a .pattern or .php extension,
 *  - a language code such as en_US (the default),
 *  - an array of stopwords, or
 *  - an instance of a class derived from AbstractStopwordProvider.
 *
 * @param string|null                           $text              Text to turn into keywords/phrases.
 * @param AbstractStopwordProvider|string|array $stopwords         Stopwords to use.
 * @param int                                   $phrase_min_length Minimum keyword/phrase length.
 * @param bool                                  $filter_numerics   Filter out numeric phrases/keywords.
 * @param null|ILangParseOptions                $parseOptions      Additional text parsing options.
 *
 * @return RakePlus
 */
public static function create($text, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true,
                              $parseOptions = null)
{
    $instance = new self($text, $stopwords, $phrase_min_length, $filter_numerics, $parseOptions);

    return $instance;
}
129 +
/**
 * Extracts the key phrases from the text and stores their scores.
 *
 * The $stopwords argument is interpreted as follows:
 *
 *  - A string with a .pattern or .php extension is treated as the path
 *    and filename of a stopwords file to load.
 *  - Any other string without an extension is treated as a language
 *    code (default: en_US); the language's .pattern file is tried first
 *    and its .php file is used when the .pattern file does not exist.
 *  - An array is used directly as the list of stopwords.
 *  - An instance of a class derived from AbstractStopwordProvider
 *    supplies the stopwords pattern itself.
 *
 * A string is only (re)loaded when no pattern is loaded yet or when it
 * differs from the language that is currently remembered.
 *
 * Nothing is extracted when $text is empty; previously calculated
 * scores are left untouched in that case.
 *
 * @param string                                $text
 * @param AbstractStopwordProvider|string|array $stopwords
 *
 * @return RakePlus
 *
 * @throws InvalidArgumentException When $stopwords has an unsupported type, or when no
 *                                  stopwords pattern is available (for example because
 *                                  a file with an unsupported extension was given).
 */
public function extract($text, $stopwords = 'en_US')
{
    if ($text == '') {
        return $this;
    }

    if (is_array($stopwords)) {
        $this->pattern = StopwordArray::create($stopwords)->pattern();
    } elseif (is_string($stopwords)) {
        if (is_null($this->pattern) || ($this->language != $stopwords)) {
            // File extensions are compared against plain ASCII names only, so the
            // byte-wise strtolower() is sufficient here and, unlike mb_strtolower(),
            // is also available when the mbstring extension is not installed.
            $extension = strtolower(pathinfo($stopwords, PATHINFO_EXTENSION));

            if (empty($extension)) {
                // No extension: a language code. First try the .pattern file.
                $this->language_file = StopwordsPatternFile::languageFile($stopwords);
                if (file_exists($this->language_file)) {
                    $this->pattern = StopwordsPatternFile::create($this->language_file)->pattern();
                } else {
                    $this->language_file = StopwordsPHP::languageFile($stopwords);
                    $this->pattern = StopwordsPHP::create($this->language_file)->pattern();
                }
                $this->language = $stopwords;
            } elseif ($extension == 'pattern') {
                $this->language = $stopwords;
                $this->language_file = $stopwords;
                $this->pattern = StopwordsPatternFile::create($this->language_file)->pattern();
            } elseif ($extension == 'php') {
                $this->language = $stopwords;
                $this->language_file = $stopwords;
                $this->pattern = StopwordsPHP::create($this->language_file)->pattern();
            }
        }
    } elseif (is_subclass_of($stopwords, 'DonatelloZa\RakePlus\AbstractStopwordProvider')) {
        $this->pattern = $stopwords->pattern();
    } else {
        throw new InvalidArgumentException('Invalid stopwords list provided for RakePlus.');
    }

    // A file with an unsupported extension matches none of the branches above.
    // Fail with a clear message instead of running the regex functions below
    // with a null pattern.
    if (is_null($this->pattern)) {
        throw new InvalidArgumentException('No stopwords pattern could be loaded for RakePlus.');
    }

    if ($this->mb_support) {
        $sentences = $this->splitSentencesMb($text);
        $phrases = $this->getPhrasesMb($sentences, $this->pattern);
    } else {
        $sentences = $this->splitSentences($text);
        $phrases = $this->getPhrases($sentences, $this->pattern);
    }

    $word_scores = $this->calcWordScores($phrases);
    $this->phrase_scores = $this->calcPhraseScores($phrases, $word_scores);

    return $this;
}
202 +
/**
 * Lists the extracted phrases without their scores.
 *
 * @return array The keys of the phrase => score map, in its current order.
 */
public function get()
{
    $phrases = array_keys($this->phrase_scores);

    return $phrases;
}
212 +
/**
 * Gives access to the extracted phrases together with their scores.
 *
 * @return array Associative array that maps each phrase to its score.
 */
public function scores()
{
    $phrase_scores = $this->phrase_scores;

    return $phrase_scores;
}
223 +
/**
 * Returns only the unique keywords (single words) found within the
 * extracted phrases instead of the full phrases themselves.
 *
 * Phrases are split on single spaces. A word is left out when numeric
 * filtering is enabled and the word is numeric, or when a minimum
 * length is set and the word is shorter than that. Empty strings that
 * result from consecutive spaces inside a phrase are never returned.
 *
 * @return array List of unique keywords in order of first appearance.
 */
public function keywords()
{
    // mb_strlen() is only usable when some implementation of it exists;
    // fall back to the byte length otherwise instead of failing fatally.
    $has_mb_strlen = function_exists('mb_strlen');
    $keywords = [];

    foreach ($this->get() as $phrase) {
        foreach (explode(' ', $phrase) as $word) {
            // Consecutive spaces inside a phrase yield empty fragments.
            if ($word === '') {
                continue;
            }

            if ($this->filter_numerics && is_numeric($word)) {
                continue;
            }

            if ($this->min_length !== 0) {
                $length = $has_mb_strlen ? mb_strlen($word) : strlen($word);
                if ($length < $this->min_length) {
                    continue;
                }
            }

            // Keying the array by the word removes duplicates. PHP converts
            // string keys that look like integers into real integers, which
            // can bite a developer who later merges arrays with mixed key
            // types, hence the array_values() on return.
            $keywords[$word] = $word;
        }
    }

    return array_values($keywords);
}
255 +
/**
 * Orders the extracted phrases by their score while keeping each
 * phrase attached to its score.
 *
 * @param string $order self::ORDER_DESC ('desc') sorts from high to low;
 *                      any other value, including the default
 *                      self::ORDER_ASC ('asc'), sorts from low to high.
 *
 * @return $this
 */
public function sortByScore($order = self::ORDER_ASC)
{
    if ($order != self::ORDER_DESC) {
        asort($this->phrase_scores);

        return $this;
    }

    arsort($this->phrase_scores);

    return $this;
}
274 +
/**
 * Orders the extracted phrases by the phrase text itself, i.e. by the
 * keys of the phrase => score map.
 *
 * @param string $order self::ORDER_DESC ('desc') sorts in reverse order;
 *                      any other value, including the default
 *                      self::ORDER_ASC ('asc'), sorts in ascending order.
 *
 * @return $this
 */
public function sort($order = self::ORDER_ASC)
{
    if ($order != self::ORDER_DESC) {
        ksort($this->phrase_scores);

        return $this;
    }

    krsort($this->phrase_scores);

    return $this;
}
293 +
/**
 * Tells which language (or stopwords file name) was most recently
 * remembered when stopwords were loaded from a string.
 *
 * @return string
 */
public function language()
{
    $language = $this->language;

    return $language;
}
303 +
/**
 * Returns the stopwords file that was selected most recently. The
 * value starts out as an empty string and stays that way until
 * stopwords have been loaded from a language code or file name.
 *
 * @return string
 */
public function languageFile()
{
    $language_file = $this->language_file;

    return $language_file;
}
314 +
/**
 * Breaks the text up into sentences using the preg_* functions.
 *
 * Line terminators are first replaced by spaces, after which the text
 * is split with the sentence expression of the parse options.
 *
 * @param string $text
 *
 * @return array
 */
private function splitSentences($text)
{
    $line_terminator_pattern = '/' . $this->line_terminator . '/';
    $sentence_pattern = '/' . $this->sentence_regex . '/';

    $single_line = preg_replace($line_terminator_pattern, ' ', $text);

    return preg_split($sentence_pattern, $single_line);
}
327 +
/**
 * Breaks the text up into sentences using the multibyte regex
 * functions (mb_ereg_replace / mb_split).
 *
 * Line terminators are first replaced by spaces, after which the text
 * is split with the sentence expression of the parse options.
 *
 * @param string $text
 *
 * @return array
 */
private function splitSentencesMb($text)
{
    $single_line = mb_ereg_replace($this->line_terminator, ' ', $text);

    return mb_split($this->sentence_regex, $single_line);
}
340 +
/**
 * Splits sentences into candidate phrases by using the stopwords
 * pattern as the separator.
 *
 * extract() selects this variant when the mbstring extension is not
 * loaded, so the mb_* helpers are only called when an implementation
 * of them is actually defined; otherwise the byte-wise strtolower()
 * and strlen() are used.
 *
 * Each phrase is trimmed and lower-cased. Empty phrases are dropped, as
 * are numeric phrases (when numeric filtering is enabled) and phrases
 * shorter than the minimum length (when one is set).
 *
 * @param array  $sentences
 * @param string $pattern   Regular expression (including delimiters) that matches the stopwords.
 *
 * @return array List of phrases; duplicates are kept.
 */
private function getPhrases(array $sentences, $pattern)
{
    $has_mb_strtolower = function_exists('mb_strtolower');
    $has_mb_strlen = function_exists('mb_strlen');
    $results = [];

    foreach ($sentences as $sentence) {
        // Every stopword match becomes a '|' so the sentence can be split on it.
        // preg_replace() returns null on a regex error; the cast turns that
        // into an empty string, which simply yields no phrases.
        $phrases_temp = (string)preg_replace($pattern, '|', $sentence);

        foreach (explode('|', $phrases_temp) as $phrase) {
            $phrase = trim($phrase);
            $phrase = $has_mb_strtolower ? mb_strtolower($phrase) : strtolower($phrase);

            if (empty($phrase)) {
                continue;
            }

            if ($this->filter_numerics && is_numeric($phrase)) {
                continue;
            }

            if ($this->min_length !== 0) {
                $length = $has_mb_strlen ? mb_strlen($phrase) : strlen($phrase);
                if ($length < $this->min_length) {
                    continue;
                }
            }

            $results[] = $phrase;
        }
    }

    return $results;
}
370 +
/**
 * Multibyte variant of the phrase splitter: cuts sentences into
 * candidate phrases wherever the stopwords pattern matches, using
 * PHP's mb_* functions.
 *
 * Leading and trailing separator/control characters are stripped from
 * each phrase before it is lower-cased. Empty phrases are dropped, as
 * are numeric phrases (when numeric filtering is enabled) and phrases
 * shorter than the minimum length (when one is set).
 *
 * @param array  $sentences
 * @param string $pattern   Expression matching the stopwords, matched case-insensitively.
 *
 * @return array List of phrases; duplicates are kept.
 */
private function getPhrasesMb(array $sentences, $pattern)
{
    $collected = [];

    foreach ($sentences as $sentence) {
        // Every stopword match becomes a '|' so the sentence can be split on it.
        $delimited = mb_eregi_replace($pattern, '|', $sentence);

        foreach (explode('|', $delimited) as $candidate) {
            // Unicode-aware trim: \pZ covers separators, \pC control characters.
            $candidate = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $candidate);
            $candidate = mb_strtolower($candidate);

            if (empty($candidate)) {
                continue;
            }

            if ($this->filter_numerics && is_numeric($candidate)) {
                continue;
            }

            if ($this->min_length !== 0 && mb_strlen($candidate) < $this->min_length) {
                continue;
            }

            $collected[] = $candidate;
        }
    }

    return $collected;
}
401 +
/**
 * Works out a score for every word that occurs in the phrases.
 *
 * For each word two totals are kept: how often it occurs (frequency)
 * and, summed over those occurrences, how many other words share its
 * phrase (degree). The score is (degree + frequency) / frequency.
 *
 * @param array $phrases
 *
 * @return array Map of word => score (float), in order of first appearance.
 */
private function calcWordScores($phrases)
{
    $frequencies = [];
    $degrees = [];

    foreach ($phrases as $phrase) {
        $words = $this->splitPhraseIntoWords($phrase);
        // Number of other words that accompany each word of this phrase.
        $companions = count($words) - 1;

        foreach ($words as $word) {
            if (!isset($frequencies[$word])) {
                $frequencies[$word] = 0;
                $degrees[$word] = 0;
            }

            $frequencies[$word]++;
            $degrees[$word] += $companions;
        }
    }

    $scores = [];
    foreach ($frequencies as $word => $frequency) {
        // The float cast guarantees a float result even for exact divisions.
        $scores[$word] = ($degrees[$word] + $frequency) / (float)$frequency;
    }

    return $scores;
}
439 +
/**
 * Scores every phrase by adding up the scores of the words it
 * consists of.
 *
 * @param array $phrases List of phrases; a phrase that occurs more than once
 *                       ends up as a single entry in the result.
 * @param array $scores  Map of word => score.
 *
 * @return array Map of phrase => score, in order of first appearance.
 */
private function calcPhraseScores($phrases, $scores)
{
    $phrase_scores = [];

    foreach ($phrases as $phrase) {
        $total = 0;

        foreach ($this->splitPhraseIntoWords($phrase) as $word) {
            $total += $scores[$word];
        }

        $phrase_scores[$phrase] = $total;
    }

    return $phrase_scores;
}
466 +
/**
 * Splits a phrase into its individual words and returns them as an
 * array. Numeric words are left out.
 *
 * The phrase is split on runs of non-word characters in UTF-8 mode.
 * The original positions are kept as array keys, so the keys of the
 * result may contain gaps where numeric words were removed.
 *
 * @param string $phrase
 *
 * @return array The non-numeric words; an empty array when the phrase
 *               cannot be split (e.g. it is not valid UTF-8).
 */
private function splitPhraseIntoWords($phrase)
{
    $parts = preg_split('/\W+/u', $phrase, -1, PREG_SPLIT_NO_EMPTY);

    // preg_split() returns false on failure, which happens for input that
    // is not valid UTF-8. Passing false on to array_filter() would raise
    // a TypeError on PHP 8, so treat such a phrase as having no words.
    if ($parts === false) {
        return [];
    }

    return array_filter($parts, function ($word) {
        return !is_numeric($word);
    });
}
481 +
/**
 * Reports the minimum number of characters a phrase/keyword must have
 * to be included. A value of 0 means that no minimum applies.
 *
 * @return int
 */
public function getMinLength()
{
    $min_length = $this->min_length;

    return $min_length;
}
491 +
/**
 * Changes the minimum number of characters a phrase/keyword must have
 * to be included. The value is converted to an integer first.
 *
 * @param int $min_length Use 0 to disable the minimum.
 *
 * @return RakePlus
 *
 * @throws InvalidArgumentException When the value is negative.
 */
public function setMinLength($min_length)
{
    $length = (int)$min_length;

    if ($length < 0) {
        throw new InvalidArgumentException('Minimum phrase length must be greater than or equal to 0.');
    }

    $this->min_length = $length;

    return $this;
}
508 +
/**
 * Switches the filtering of numeric-only phrases/keywords on or off.
 * The value is stored exactly as it is passed in.
 *
 * @param bool $filter_numerics True (the default) to leave numeric phrases/keywords out.
 *
 * @return RakePlus
 */
public function setFilterNumerics($filter_numerics = true)
{
    $enabled = $filter_numerics;
    $this->filter_numerics = $enabled;

    return $this;
}
522 +
/**
 * Reports whether numeric-only phrases/keywords are currently being
 * filtered out.
 *
 * @return bool
 */
public function getFilterNumerics()
{
    $enabled = $this->filter_numerics;

    return $enabled;
}
533 + }
534 +