Diff: STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/rake-php-plus/src/RakePlus.php
Keine Baseline-Datei – Diff nur gegen leer.
1
-
1
+
<?php
2
+
3
+
namespace DonatelloZa\RakePlus;
4
+
5
+
defined('ABSPATH') or die();
6
+
use InvalidArgumentException;
7
+
8
+
/**
 * Keyword/key-phrase extractor.
 *
 * Text is split into sentences, each sentence is split into candidate
 * phrases at stopword boundaries, every word receives a score of
 * (degree + frequency) / frequency, and every phrase is scored as the sum
 * of the scores of its words.
 */
class RakePlus
{
    /** @var string Language identifier or stopword file path that was last loaded from a string. */
    protected $language = 'en_US';

    /** @var string Path of the stopword file that was loaded; empty string when none was loaded. */
    protected $language_file = "";

    /** @var string|null Stopword pattern used to cut sentences into phrases; null until loaded. */
    private $pattern = null;

    /** @var array Extracted phrases mapped to their score (phrase => score). */
    private $phrase_scores = [];

    /** @var int Minimum number of characters a phrase/keyword must have; 0 disables the check. */
    private $min_length = 0;

    /** @var bool Whether purely numeric phrases/keywords are discarded. */
    private $filter_numerics = true;

    /** @var string Sentence-splitting regular expression, stored without delimiters. */
    private $sentence_regex;

    /** @var string Line-terminator regular expression, stored without delimiters. */
    private $line_terminator;

    /** @var bool True when the mbstring extension is loaded (set by the constructor). */
    public $mb_support = false;

    /** @var LangParseOptions|ILangParseOptions Text parsing options in use. */
    public $parseOptions;

    const ORDER_ASC = 'asc';

    const ORDER_DESC = 'desc';

    /**
     * RakePlus constructor. Instantiates RakePlus and extracts
     * the key phrases from the text if supplied.
     *
     * If $stopwords is a string the method will:
     *
     * 1) Determine if it has a .pattern or .php extension and if
     *    so will attempt to load the stopwords from the specified path
     *    and filename.
     * 2) If it has no extension, it will assume that a language string
     *    was specified and will then attempt to read the stopwords from
     *    the language's .pattern file or, failing that, its .php file
     *    (default language: en_US).
     *
     * If $stopwords is an array it will simply use the array of stopwords
     * as provided.
     *
     * If $stopwords is an instance of a class derived from
     * AbstractStopwordProvider it will simply retrieve the stopword
     * pattern from the instance.
     *
     * @param string|null                            $text              Text to turn into keywords/phrases.
     * @param AbstractStopwordProvider|string|array  $stopwords         Stopwords/language to use.
     * @param int                                    $phrase_min_length Minimum keyword/phrase length.
     * @param bool                                   $filter_numerics   Filter out numeric numbers.
     * @param null|ILangParseOptions                 $parseOptions      Additional text parsing options. When
     *                                                                  null, the options are obtained from
     *                                                                  LangParseOptions::create($stopwords).
     *
     * @throws InvalidArgumentException If $parseOptions is neither null nor an ILangParseOptions
     *                                  instance, or if $phrase_min_length is negative.
     */
    public function __construct($text = null, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true,
                                $parseOptions = null)
    {
        $this->mb_support = extension_loaded('mbstring');

        $this->setMinLength($phrase_min_length);
        $this->setFilterNumerics($filter_numerics);

        if ($parseOptions === null) {
            $parseOptions = LangParseOptions::create($stopwords);
        } elseif (!($parseOptions instanceof ILangParseOptions)) {
            throw new InvalidArgumentException("The \$parseOptions argument must be an instance of ILangParseOptions.");
        }

        $this->parseOptions = $parseOptions;
        $this->sentence_regex = $this->parseOptions->getSentenceRegex();
        $this->line_terminator = $this->parseOptions->getLineTerminator();

        if (!is_null($text)) {
            $this->extract($text, $stopwords);
        }
    }

    /**
     * Instantiates a RakePlus instance and extracts
     * the key phrases from the text.
     *
     * The arguments are passed to the constructor unchanged; see the
     * constructor for how $stopwords is interpreted.
     *
     * @param string|null                            $text              Text to turn into keywords/phrases.
     * @param AbstractStopwordProvider|string|array  $stopwords         Stopwords to use.
     * @param int                                    $phrase_min_length Minimum keyword/phrase length.
     * @param bool                                   $filter_numerics   Filter out numeric numbers.
     * @param null|ILangParseOptions                 $parseOptions      Additional text parsing options.
     *
     * @return RakePlus
     */
    public static function create($text, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true,
                                  $parseOptions = null)
    {
        return new self($text, $stopwords, $phrase_min_length, $filter_numerics, $parseOptions);
    }

    /**
     * Extracts the key phrases from the text.
     *
     * If $stopwords is a string the method will:
     *
     * 1) Determine if it has a .pattern or .php extension and if
     *    so will attempt to load the stopwords from the specified path
     *    and filename.
     * 2) If it has no extension, it will assume that a language string
     *    was specified and will then attempt to read the stopwords from
     *    the language's .pattern file or, failing that, its .php file
     *    (default language: en_US).
     *
     * A string is only (re)loaded when no pattern has been loaded yet or
     * when it differs from the language that is currently loaded.
     *
     * If $stopwords is an array it will simply use the array of stopwords
     * as provided.
     *
     * If $stopwords is an instance of a class derived from
     * AbstractStopwordProvider it will simply retrieve the stopword
     * pattern from the instance.
     *
     * When $text is empty nothing happens: previously extracted phrases
     * and the loaded stopwords are left untouched.
     *
     * @param string                                $text
     * @param AbstractStopwordProvider|string|array $stopwords
     *
     * @return RakePlus
     *
     * @throws InvalidArgumentException If $stopwords has an unsupported type or names a
     *                                  file with an unsupported extension.
     */
    public function extract($text, $stopwords = 'en_US')
    {
        if ($text == '') {
            return $this;
        }

        if (is_array($stopwords)) {
            $this->pattern = StopwordArray::create($stopwords)->pattern();
        } elseif (is_string($stopwords)) {
            if (is_null($this->pattern) || ($this->language != $stopwords)) {
                $this->loadStopwordsFromString($stopwords);
            }
        } elseif ($stopwords instanceof AbstractStopwordProvider) {
            $this->pattern = $stopwords->pattern();
        } else {
            throw new InvalidArgumentException('Invalid stopwords list provided for RakePlus.');
        }

        if ($this->mb_support) {
            $phrases = $this->getPhrasesMb($this->splitSentencesMb($text), $this->pattern);
        } else {
            $phrases = $this->getPhrases($this->splitSentences($text), $this->pattern);
        }

        $word_scores = $this->calcWordScores($phrases);
        $this->phrase_scores = $this->calcPhraseScores($phrases, $word_scores);

        return $this;
    }

    /**
     * Loads the stopword pattern identified by a language string or by
     * the path of a .pattern/.php stopword file.
     *
     * @param string $stopwords Language string or stopword file path.
     *
     * @return void
     *
     * @throws InvalidArgumentException If the file extension is not supported.
     */
    private function loadStopwordsFromString($stopwords)
    {
        $extension = $this->toLower(pathinfo($stopwords, PATHINFO_EXTENSION));

        if (empty($extension)) {
            // A bare language string: first try the .pattern file, then
            // fall back to the .php file.
            $this->language_file = StopwordsPatternFile::languageFile($stopwords);
            if (file_exists($this->language_file)) {
                $this->pattern = StopwordsPatternFile::create($this->language_file)->pattern();
            } else {
                $this->language_file = StopwordsPHP::languageFile($stopwords);
                $this->pattern = StopwordsPHP::create($this->language_file)->pattern();
            }
            $this->language = $stopwords;
        } elseif ($extension == 'pattern') {
            $this->language = $stopwords;
            $this->language_file = $stopwords;
            $this->pattern = StopwordsPatternFile::create($this->language_file)->pattern();
        } elseif ($extension == 'php') {
            $this->language = $stopwords;
            $this->language_file = $stopwords;
            $this->pattern = StopwordsPHP::create($this->language_file)->pattern();
        } else {
            // Previously this case loaded nothing and silently carried on
            // with a null or stale pattern; fail loudly instead.
            throw new InvalidArgumentException('Invalid stopwords list provided for RakePlus.');
        }
    }

    /**
     * Returns the extracted phrases.
     *
     * The phrases are the keys of the phrase/score map, so PHP's usual
     * array-key rules apply (integer-like phrases come back as integers).
     *
     * @return array
     */
    public function get()
    {
        return array_keys($this->phrase_scores);
    }

    /**
     * Returns the phrases and a score for each of
     * the phrases as an associative array.
     *
     * @return array
     */
    public function scores()
    {
        return $this->phrase_scores;
    }

    /**
     * Returns only the unique keywords within the
     * phrases instead of the full phrases itself.
     *
     * Phrases are split on single spaces. Empty tokens (produced by
     * consecutive spaces), numeric tokens (when numeric filtering is on)
     * and tokens shorter than the minimum length are left out.
     *
     * @return array
     */
    public function keywords()
    {
        $keywords = [];

        foreach ($this->get() as $phrase) {
            foreach (explode(' ', $phrase) as $word) {
                if ($word === '') {
                    continue;
                }

                if ($this->filter_numerics && is_numeric($word)) {
                    continue;
                }

                if ($this->min_length !== 0 && $this->strLength($word) < $this->min_length) {
                    continue;
                }

                // The word is stored as the value as well as the key, and
                // only the values are returned: PHP converts string keys
                // that look like integers into real integers, which can
                // cause trouble when a developer later appends arrays
                // holding a mix of integer and string keys.
                $keywords[$word] = $word;
            }
        }

        return array_values($keywords);
    }

    /**
     * Sorts the phrases by score, use 'asc' or 'desc' to specify a
     * sort order.
     *
     * @param string $order Default is 'asc'
     *
     * @return $this
     */
    public function sortByScore($order = self::ORDER_ASC)
    {
        if ($order == self::ORDER_DESC) {
            arsort($this->phrase_scores);

            return $this;
        }

        asort($this->phrase_scores);

        return $this;
    }

    /**
     * Sorts the phrases alphabetically, use 'asc' or 'desc' to specify a
     * sort order.
     *
     * @param string $order Default is 'asc'
     *
     * @return $this
     */
    public function sort($order = self::ORDER_ASC)
    {
        if ($order == self::ORDER_DESC) {
            krsort($this->phrase_scores);

            return $this;
        }

        ksort($this->phrase_scores);

        return $this;
    }

    /**
     * Returns the current language being used.
     *
     * @return string
     */
    public function language()
    {
        return $this->language;
    }

    /**
     * Returns the language file that was loaded. Will be an
     * empty string if no file has been loaded.
     *
     * @return string
     */
    public function languageFile()
    {
        return $this->language_file;
    }

    /**
     * Splits the text into an array of sentences.
     *
     * Line terminators are replaced by a single space before the text
     * is split on the sentence regular expression.
     *
     * @param string $text
     *
     * @return array
     */
    private function splitSentences($text)
    {
        $single_line = preg_replace('/' . $this->line_terminator . '/', ' ', $text);

        return preg_split('/' . $this->sentence_regex . '/', $single_line);
    }

    /**
     * Splits the text into an array of sentences. Uses mb_* functions.
     *
     * Line terminators are replaced by a single space before the text
     * is split on the sentence regular expression.
     *
     * @param string $text
     *
     * @return array
     */
    private function splitSentencesMb($text)
    {
        $single_line = mb_ereg_replace($this->line_terminator, ' ', $text);

        return mb_split($this->sentence_regex, $single_line);
    }

    /**
     * Split sentences into phrases by using the stopwords.
     *
     * This is the path taken when the mbstring extension is not loaded,
     * so it must not call mb_* functions unguarded.
     *
     * @param array  $sentences
     * @param string $pattern
     *
     * @return array
     */
    private function getPhrases(array $sentences, $pattern)
    {
        $results = [];

        foreach ($sentences as $sentence) {
            // Every stopword match becomes a phrase separator.
            $marked = preg_replace($pattern, '|', $sentence);
            if (!is_string($marked)) {
                // preg_replace() failed; this sentence yields no phrases.
                continue;
            }

            foreach (explode('|', $marked) as $fragment) {
                $phrase = $this->toLower(trim($fragment));
                if ($this->isAcceptablePhrase($phrase)) {
                    $results[] = $phrase;
                }
            }
        }

        return $results;
    }

    /**
     * Split sentences into phrases by using the stopwords. Makes use of
     * PHP's mb_* functions.
     *
     * @param array  $sentences
     * @param string $pattern
     *
     * @return array
     */
    private function getPhrasesMb(array $sentences, $pattern)
    {
        $results = [];

        foreach ($sentences as $sentence) {
            // Every stopword match becomes a phrase separator.
            $marked = mb_eregi_replace($pattern, '|', $sentence);
            if (!is_string($marked)) {
                // mb_eregi_replace() failed; this sentence yields no phrases.
                continue;
            }

            foreach (explode('|', $marked) as $fragment) {
                // Strip leading/trailing Unicode separator (Z) and
                // control/other (C) characters.
                $trimmed = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $fragment);
                $phrase = mb_strtolower((string)$trimmed);
                if ($this->isAcceptablePhrase($phrase)) {
                    $results[] = $phrase;
                }
            }
        }

        return $results;
    }

    /**
     * Decides whether a candidate phrase passes the empty, numeric and
     * minimum-length filters.
     *
     * The emptiness test is a strict comparison with '' on purpose:
     * empty() would also reject the phrase "0".
     *
     * @param string $phrase Trimmed, lower-cased candidate phrase.
     *
     * @return bool
     */
    private function isAcceptablePhrase($phrase)
    {
        if ($phrase === '') {
            return false;
        }

        if ($this->filter_numerics && is_numeric($phrase)) {
            return false;
        }

        return $this->min_length === 0 || $this->strLength($phrase) >= $this->min_length;
    }

    /**
     * Lower-cases a string, using mb_strtolower() when it is available
     * and strtolower() otherwise.
     *
     * @param string $value
     *
     * @return string
     */
    private function toLower($value)
    {
        return function_exists('mb_strtolower') ? mb_strtolower($value) : strtolower($value);
    }

    /**
     * Returns the length of a string, using mb_strlen() when it is
     * available and strlen() otherwise.
     *
     * @param string $value
     *
     * @return int
     */
    private function strLength($value)
    {
        return function_exists('mb_strlen') ? mb_strlen($value) : strlen($value);
    }

    /**
     * Calculate a score for each word.
     *
     * A word's frequency is the number of phrases occurrences it has; its
     * degree is the sum, over those occurrences, of the number of other
     * words in the phrase, plus its frequency. The score is
     * degree / frequency.
     *
     * @param array $phrases
     *
     * @return array Word => score.
     */
    private function calcWordScores($phrases)
    {
        $frequencies = [];
        $degrees = [];

        foreach ($phrases as $phrase) {
            $words = $this->splitPhraseIntoWords($phrase);
            $co_occurrences = count($words) - 1;

            foreach ($words as $word) {
                if (!isset($frequencies[$word])) {
                    $frequencies[$word] = 0;
                    $degrees[$word] = 0;
                }

                $frequencies[$word] += 1;
                $degrees[$word] += $co_occurrences;
            }
        }

        $scores = [];
        foreach ($frequencies as $word => $frequency) {
            $scores[$word] = ($degrees[$word] + $frequency) / (float)$frequency;
        }

        return $scores;
    }

    /**
     * Calculate score for each phrase by word scores.
     *
     * A phrase's score is the sum of the scores of its words.
     *
     * @param array $phrases
     * @param array $scores  Word => score, as returned by calcWordScores().
     *
     * @return array Phrase => score.
     */
    private function calcPhraseScores($phrases, $scores)
    {
        $keywords = [];

        foreach ($phrases as $phrase) {
            $total = 0;

            foreach ($this->splitPhraseIntoWords($phrase) as $word) {
                $total += $scores[$word];
            }

            $keywords[$phrase] = $total;
        }

        return $keywords;
    }

    /**
     * Split a phrase into multiple words and returns them
     * as an array. Numeric words are left out.
     *
     * The keys of the returned array are those produced by the split
     * and are therefore not necessarily sequential.
     *
     * @param string $phrase
     *
     * @return array
     */
    private function splitPhraseIntoWords($phrase)
    {
        $words = preg_split('/\W+/u', $phrase, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            // preg_split() failed; treat the phrase as having no words
            // rather than handing false to array_filter().
            return [];
        }

        return array_filter($words, function ($word) {
            return !is_numeric($word);
        });
    }

    /**
     * Returns the minimum number of letters each phrase/keyword must have.
     *
     * @return int
     */
    public function getMinLength()
    {
        return $this->min_length;
    }

    /**
     * Sets the minimum number of letters each phrase/keyword must have.
     *
     * @param int $min_length
     *
     * @return RakePlus
     *
     * @throws InvalidArgumentException If the length is negative.
     */
    public function setMinLength($min_length)
    {
        $length = (int)$min_length;

        if ($length < 0) {
            throw new InvalidArgumentException('Minimum phrase length must be greater than or equal to 0.');
        }

        $this->min_length = $length;

        return $this;
    }

    /**
     * Sets whether numeric-only phrases/keywords should be filtered
     * out or not.
     *
     * @param bool $filter_numerics
     *
     * @return RakePlus
     */
    public function setFilterNumerics($filter_numerics = true)
    {
        $this->filter_numerics = $filter_numerics;

        return $this;
    }

    /**
     * Returns whether numeric-only phrases/keywords will be filtered
     * out or not.
     *
     * @return bool
     */
    public function getFilterNumerics()
    {
        return $this->filter_numerics;
    }
}
534
+