Diff: STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/rake-php-plus/console/extractor.php
Keine Baseline-Datei – Diff nur gegen leer.
1
-
1
+
<?php
2
+
3
+
/**
4
+
* Stopwords are either supplied in simple text files that
5
+
* are copied from web pages such as this:
6
+
* http://www.lextek.com/manuals/onix/stopwords2.html
7
+
*
8
+
* or it can be supplied as a .json file that is stored in the
9
+
* format ["a","a's","able","about","above", .... ]
10
+
*
11
+
* This tool extracts the stopwords from these files and
12
+
* produces either a .php output (containing a PHP array)
13
+
* or a .pattern file containing a regular expression pattern.
14
+
*
15
+
* Usage:
16
+
* To generate PHP output:
17
+
* php -q extractor.php stopwords_en_US.txt --output=php
18
+
*
19
+
* To generate a regular expression pattern:
20
+
* php -q extractor.php stopwords_en_US.txt --output=pattern
21
+
*
22
+
* To generate a regular expression pattern from a php array:
23
+
* php -q extractor.php en_US.php --output=pattern
24
+
*
25
+
* Sorting the keywords in descending order, e.g. Z -> A is
26
+
* important and for the tool to sort languages other than
27
+
* English properly it needs to set the locale using PHP's
28
+
* setlocale() function which depends on your system's
29
+
* available locales. To check your locales on Linux run:
30
+
*
31
+
* $ locale -a
32
+
*
33
+
* To install more locales:
34
+
*
35
+
* $ sudo locale-gen es_AR
36
+
* $ sudo locale-gen es_AR.utf8
37
+
*/
38
+
39
+
/**
 * Aborts with usage help when no stopwords file was given on the command line.
 *
 * When fewer than two arguments are present (script name plus stopwords
 * file), the usage text is printed and the script terminates with exit
 * status 1. Otherwise the function returns without producing any output.
 *
 * @param int $arg_count Number of command-line arguments, including the script name.
 *
 * @return void
 */
function check_args($arg_count)
{
    if ($arg_count >= 2) {
        return;
    }

    echo "\n";
    echo "Error: Please specify the filename of the stopwords file to extract.\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=php\n";
    // Fixed: this example previously read "./console extractor.php" (missing slash).
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=php\n";
    echo "\n";
    echo "For better RakePlus performance, use the --output argument to produce\n";
    echo "regular expression pattern instead of a PHP script.\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=pattern\n";
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern\n";
    echo "\n";
    echo "You can pipe the output of this tool directly into a\n";
    echo ".php or .pattern file:\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=php > en_US.php\n";
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern > en_US.pattern\n";
    echo " php ./console/extractor.php en_US.php --locale=en_US --output=pattern > en_US.pattern\n";
    echo "\n";

    exit(1);
}
68
+
69
+
/**
 * Fetches a positional argument from the argument list.
 *
 * @param array $args    The argument list to read from.
 * @param int   $arg_no  Zero-based position of the wanted argument.
 * @param mixed $default Value returned when $arg_no is not below count($args).
 *
 * @return mixed The argument at position $arg_no, or $default.
 */
function get_arg_by_index($args, $arg_no, $default = null)
{
    $total = count($args);

    return ($arg_no < $total) ? $args[$arg_no] : $default;
}
84
+
85
+
/**
 * Looks up the value of a "--name=value" style argument.
 *
 * The first argument whose part before the first "=" is identical to $name
 * wins. Everything after that first "=" is returned as the value, so values
 * that themselves contain "=" are preserved in full.
 *
 * @param array  $args    The argument list to search.
 * @param string $name    The argument name including any leading dashes, e.g. "--locale".
 * @param mixed  $default Returned when the argument is absent or has no "=value" part.
 *
 * @return mixed The argument's value, or $default.
 */
function get_arg_by_name($args, $name, $default = null)
{
    foreach ($args as $arg) {
        // Limit of 2: split on the first "=" only, keeping the rest of the value intact.
        $parts = explode('=', $arg, 2);

        // Strict comparison so numeric-looking names such as "10" and "1e1" stay distinct.
        if ($parts[0] === $name) {
            return isset($parts[1]) ? $parts[1] : $default;
        }
    }

    return $default;
}
103
+
104
+
/**
 * Returns true if one of the arguments is exactly the supplied $name.
 *
 * The comparison is strict, so numeric-looking strings such as "1e3" and
 * "1000" are not treated as equal.
 *
 * @param array  $args The argument list to search.
 * @param string $name The argument to look for, e.g. "--nosort".
 *
 * @return bool
 */
function has_arg($args, $name)
{
    return in_array($name, $args, true);
}
123
+
124
+
/**
 * Loads a de-duplicated list of stopwords from a .txt, .json or .php file.
 *
 * - .txt:  one stopword per line; surrounding whitespace/control characters
 *          are trimmed, empty lines and lines starting with "#" are skipped.
 * - .json: the file must decode to an array of stopwords.
 * - .php:  the file must return an array of stopwords.
 *
 * Any other extension yields an empty array. A missing, unreadable or
 * malformed file prints an error message and terminates the script with
 * exit status 1.
 *
 * @param string $stopwords_file Path to the stopwords file.
 *
 * @return array List of unique stopwords (strings) in order of first appearance.
 */
function load_stopwords($stopwords_file)
{
    if (!file_exists($stopwords_file)) {
        echo "\n";
        echo "Error: Stopwords file \"{$stopwords_file}\" not found.\n";
        echo "\n";
        exit(1);
    }

    $ext = pathinfo($stopwords_file, PATHINFO_EXTENSION);

    if ($ext === 'txt') {
        $h = @fopen($stopwords_file, 'r');
        if ($h === false) {
            echo "\n";
            echo "Error: Could not read text file \"{$stopwords_file}\".\n";
            echo "\n";
            exit(1);
        }

        $words = [];
        while (($line = fgets($h)) !== false) {
            $line = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $line);

            // preg_replace() yields null for lines that are not valid UTF-8.
            // Compare against '' instead of using empty() so the word "0" is kept.
            if ($line === null || $line === '' || $line[0] === '#') {
                continue;
            }

            $words[] = $line;
        }

        // Release the handle; it was previously left open.
        fclose($h);
    } elseif ($ext === 'json') {
        $words = json_decode(file_get_contents($stopwords_file), true);
        if (!is_array($words)) {
            echo "\n";
            echo "Error: Could not parse JSON file \"{$stopwords_file}\".\n";
            echo "\n";
            exit(1);
        }
    } elseif ($ext === 'php') {
        // NOTE(review): this executes the supplied file as PHP code; only
        // run the tool on stopword files you trust.
        /** @noinspection PhpIncludeInspection */
        $words = require $stopwords_file;
        if (!is_array($words)) {
            echo "\n";
            echo "Error: PHP file \"{$stopwords_file}\" did not return an array.\n";
            echo "\n";
            exit(1);
        }
    } else {
        return [];
    }

    // De-duplicate via array keys (keeps first-appearance order). PHP casts
    // numeric-string keys to int, so convert every word back to a string.
    return array_map('strval', array_keys(array_fill_keys($words, true)));
}
172
+
173
+
/**
 * Writes the stopwords to stdout as a PHP script that returns an array.
 *
 * The output starts with a UTF-8 byte order mark, followed by a header
 * comment containing the UTC extraction timestamp and a "return [...]"
 * statement with one single-quoted stopword per line.
 *
 * @param array $stopwords The stopwords to render; array keys are ignored.
 *
 * @throws Exception If the timestamp cannot be created.
 */
function render_php_output(array $stopwords)
{
    $timestamp = (new DateTime('now', new DateTimeZone('UTC')))->format(DateTime::ATOM);

    echo "\xEF\xBB\xBF<?php\n";
    echo "\n";
    echo "/**\n";
    echo " * Stopwords list for the use in the PHP package rake-php-plus.\n";
    echo " * See: https://github.com/Donatello-za/rake-php-plus\n";
    echo " *\n";
    echo " * Extracted using extractor.php @ {$timestamp} \n";
    echo " */\n";
    echo "\n";
    echo 'return [' . "\n";

    $entries = [];
    foreach ($stopwords as $word) {
        // Escape backslashes as well as single quotes; otherwise a word that
        // contains or ends in "\" produces a wrong or unterminated literal.
        $entries[] = " '" . strtr($word, ['\\' => '\\\\', "'" => "\\'"]) . "'";
    }

    if (count($entries) > 0) {
        // Every entry but the last is followed by a comma.
        echo implode(",\n", $entries) . "\n";
    }

    echo "];\n";
    echo "\n";
}
205
+
206
+
/**
 * Writes the stopwords to stdout as one case-insensitive regular expression.
 *
 * Each stopword becomes a word-boundary delimited alternative. The output
 * starts with a UTF-8 byte order mark and has the form "/alt1|alt2|.../i".
 *
 * @param array $stopwords The stopwords to turn into a pattern.
 */
function render_pattern_output(array $stopwords)
{
    $regex = [];

    foreach ($stopwords as $word) {
        // Quote regex metacharacters and the "/" delimiter so that words such
        // as "a.m." or "c++" match literally and cannot break the pattern.
        $quoted = preg_quote($word, '/');

        if (mb_strlen($word) === 1) {
            // This pattern allows for words such as a-class and j'aimerais, however,
            // words such as day-z will be broken up into day- and the z will go
            // missing. A possible workaround is to set the pattern as:
            // '\b(?!-)' . $word . '(?!(-|\'))\b'
            // but then two character words such as WA will also be stripped out.
            $regex[] = '\b' . $quoted . '(?!(-|\'))\b';
        } else {
            $regex[] = '\b' . $quoted . '\b';
        }
    }

    echo "\xEF\xBB\xBF" . '/' . implode('|', $regex) . '/i' . "\n";
}
229
+
230
+
/**
 * Writes the stopwords to stdout as a pretty-printed JSON array.
 *
 * If the list cannot be encoded (for example because a word is not valid
 * UTF-8), an error message is printed and the script terminates with exit
 * status 1 instead of silently emitting an empty line.
 *
 * @param array $stopwords The stopwords to render; array keys are ignored.
 */
function render_json_output(array $stopwords)
{
    // array_values() guarantees a JSON array even if the keys are not 0..n-1.
    $json = json_encode(array_values($stopwords), JSON_PRETTY_PRINT);

    if ($json === false) {
        echo "\n";
        echo "Error: Could not encode the stopwords as JSON (error code " . json_last_error() . ").\n";
        echo "\n";
        exit(1);
    }

    echo $json . "\n";
}
237
+
238
+
/**
 * Dispatches the stopword list to the renderer for the requested format.
 *
 * An empty stopword list prints an error naming the source file and ends the
 * script with exit status 1. An $output value other than "pattern", "php" or
 * "json" produces no output.
 *
 * @param array  $stopwords      The stopwords to render.
 * @param string $stopwords_file Source file name, used only in the error message.
 * @param string $output         One of "pattern", "php" or "json".
 *
 * @throws Exception
 */
function render_output(array $stopwords, $stopwords_file, $output = 'php')
{
    if (count($stopwords) === 0) {
        echo "\n";
        echo "Error: No stopwords found in file \"{$stopwords_file}\".\n";
        echo "\n";
        exit(1);
    }

    switch ($output) {
        case 'pattern':
            render_pattern_output($stopwords);
            break;

        case 'php':
            render_php_output($stopwords);
            break;

        case 'json':
            render_json_output($stopwords);
            break;
    }
}
263
+
264
+
// Script entry point: validate arguments, load the stopwords, sort them
// (unless --nosort is given) and render them in the requested format.
check_args($argc);

$stopwords_file = get_arg_by_index($argv, 1);
$stopwords = load_stopwords($stopwords_file);

if (!has_arg($argv, '--nosort')) {
    // The locale is only needed for collation, so it is only required when sorting.
    $locale = get_arg_by_name($argv, '--locale');
    if ($locale === null) {
        // Stop here: continuing would sort with the invalid locale ".utf8" and
        // mix this message into output that is typically piped into a file.
        echo "Please specify the locale, e.g. --locale=en_US\n";
        exit(1);
    }

    $result = setlocale(LC_COLLATE, $locale . '.utf8');
    if ($result === false) {
        // Best effort: keep going with the current collation, but report it on
        // stderr so that redirected stdout output stays clean.
        file_put_contents(
            'php://stderr',
            "Warning: Locale \"{$locale}.utf8\" is not available; sorting may be incorrect.\n"
        );
    }

    // Descending (Z -> A) is the default; --ascending reverses it.
    $descending = !has_arg($argv, '--ascending');
    usort($stopwords, function ($a, $b) use ($descending) {
        return $descending ? strcoll($b, $a) : strcoll($a, $b);
    });
}

$OUTPUT_TYPES = ['pattern', 'php', 'json'];
$output = get_arg_by_name($argv, '--output');
if (!in_array($output, $OUTPUT_TYPES, true)) {
    echo "Please specify the output format, e.g. --output=pattern, --output=php or --output=json\n";
    exit(1);
}

/** @noinspection PhpUnhandledExceptionInspection */
render_output($stopwords, $stopwords_file, $output);
304
+
305
+