Diff: STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/rake-php-plus/console/extractor.php

Keine Baseline-Datei – Diff nur gegen leer.
Zur Liste
1 -
1 + <?php
2 +
3 + /**
4 + * Stopwords are either supplied in simple text files that
5 + * are copied from web pages such as this:
6 + * http://www.lextek.com/manuals/onix/stopwords2.html
7 + *
8 + * or it can be supplied as a .json file that is stored in the
9 + * format ["a","a's","able","about","above", .... ]
10 + *
11 + * This tool extracts the stopwords from these files and
12 + * produces either a .php output (containing a PHP array)
13 + * or a .pattern file containing a regular expression pattern.
14 + *
15 + * Usage:
16 + * To generate PHP output:
17 + * php -q extractor.php stopwords_en_US.txt --output=php
18 + *
19 + * To generate a regular expression pattern:
20 + * php -q extractor.php stopwords_en_US.txt --output=pattern
21 + *
22 + * To generate a regular expression pattern from a php array:
23 + * php -q extractor.php en_US.php --output=pattern
24 + *
25 + * Sorting the keywords in descending order, e.g. Z -> A is
26 + * important and for the tool to sort languages other than
27 + * English properly it needs to set the locale using PHP's
28 + * setlocale() function which depends on your system's
29 + * available locales. To check your locales on Linux run:
30 + *
31 + * $ locale -a
32 + *
33 + * To install more locales:
34 + *
35 + * $ sudo locale-gen es_AR
36 + * $ sudo locale-gen es_AR.utf8
37 + */
38 +
/**
 * Prints usage instructions and terminates the script with exit code 1
 * when no stopwords file was supplied on the command line.
 *
 * @param int $arg_count Number of command line arguments including the
 *                       script name itself (the script passes $argc).
 */
function check_args($arg_count)
{
    // The script name counts as the first argument, so at least two
    // arguments are needed before a stopwords file can be present.
    if ($arg_count >= 2) {
        return;
    }

    echo "\n";
    echo "Error: Please specify the filename of the stopwords file to extract.\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=php\n";
    // Fixed: this example used to read "./console extractor.php" (missing "/").
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=php\n";
    echo "\n";
    echo "For better RakePlus performance, use the --output argument to produce\n";
    echo "regular expression pattern instead of a PHP script.\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=pattern\n";
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern\n";
    echo "\n";
    echo "You can pipe the output of this tool directly into a\n";
    echo ".php or .pattern file:\n";
    echo "Example:\n";
    echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=php > en_US.php\n";
    echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern > en_US.pattern\n";
    echo " php ./console/extractor.php en_US.php --locale=en_US --output=pattern > en_US.pattern\n";
    echo "\n";

    exit(1);
}
68 +
/**
 * Fetches a positional argument.
 *
 * @param array $args    The argument list to read from.
 * @param int   $arg_no  Zero-based position of the wanted argument.
 * @param mixed $default Value handed back when $arg_no is not smaller
 *                       than the number of arguments.
 *
 * @return mixed The argument at position $arg_no, or $default.
 */
function get_arg_by_index($args, $arg_no, $default = null)
{
    $within_bounds = $arg_no < count($args);

    return $within_bounds ? $args[$arg_no] : $default;
}
84 +
/**
 * Looks up a "--name=value" style argument and returns its value.
 *
 * The first argument whose part before the "=" equals $name wins. When
 * that argument carries no "=" at all, $default is returned for it.
 *
 * @param array  $args    The argument list to search.
 * @param string $name    The argument name, e.g. "--locale".
 * @param mixed  $default Value returned when the argument is missing or
 *                        has no value.
 *
 * @return mixed The argument's value, or $default.
 */
function get_arg_by_name($args, $name, $default = null)
{
    foreach ($args as $arg) {
        // Split on the first "=" only, so that a value which itself
        // contains "=" (e.g. --filter=a=b) is returned in full.
        $parts = explode('=', $arg, 2);

        if ($parts[0] !== (string) $name) {
            continue;
        }

        return count($parts) === 2 ? $parts[1] : $default;
    }

    return $default;
}
103 +
/**
 * Tells whether one of the arguments is equal to the supplied $name.
 *
 * The comparison is loose (==) and is made against the whole argument,
 * so "--locale=en_US" does not count as "--locale".
 *
 * @param array  $args The argument list to search.
 * @param string $name The argument to look for, e.g. "--nosort".
 *
 * @return bool True when a matching argument exists, false otherwise.
 */
function has_arg($args, $name)
{
    $found = false;

    foreach ($args as $candidate) {
        if ($candidate == $name) {
            $found = true;
            break;
        }
    }

    return $found;
}
123 +
/**
 * Reads the stopwords from a .txt, .json or .php file.
 *
 * - .txt:  one stopword per line; surrounding whitespace and control
 *          characters are stripped, empty lines and lines starting
 *          with "#" are skipped.
 * - .json: the file must decode to an array of words.
 * - .php:  the file must return an array of words.
 *
 * Duplicates are removed while keeping the order of first occurrence and
 * every word is returned as a string. Any other extension yields an
 * empty array. A missing, unreadable or malformed file prints an error
 * and terminates the script with exit code 1.
 *
 * @param string $stopwords_file Path of the file to load.
 *
 * @return array List of unique stopwords.
 */
function load_stopwords($stopwords_file)
{
    if (!file_exists($stopwords_file)) {
        echo "\n";
        echo "Error: Stopwords file \"{$stopwords_file}\" not found.\n";
        echo "\n";
        exit(1);
    }

    // De-duplicates a word list, keeps first occurrences in order and
    // makes sure numeric looking words such as "0" stay strings.
    $unique = function (array $words) {
        $strings = array_map('strval', array_filter($words, 'is_scalar'));

        return array_values(array_unique($strings));
    };

    // Compare the extension case-insensitively so "WORDS.TXT" works too.
    $ext = strtolower(pathinfo($stopwords_file, PATHINFO_EXTENSION));

    if ($ext === 'txt') {
        $h = @fopen($stopwords_file, 'r');
        if ($h === false) {
            echo "\n";
            echo "Error: Could not read text file \"{$stopwords_file}\".\n";
            echo "\n";
            exit(1);
        }

        $stopwords = [];
        while (($line = fgets($h)) !== false) {
            $line = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $line);

            // preg_replace() yields null when the line is not valid UTF-8.
            // An explicit '' check is used instead of empty() because
            // empty('0') is true and would drop the stopword "0".
            if ($line === null || $line === '' || $line[0] === '#') {
                continue;
            }

            $stopwords[] = $line;
        }

        fclose($h);

        return $unique($stopwords);
    }

    if ($ext === 'json') {
        $json = file_get_contents($stopwords_file);
        $stopwords = ($json === false) ? null : json_decode($json, true);

        if (!is_array($stopwords)) {
            echo "\n";
            echo "Error: Could not parse JSON file \"{$stopwords_file}\".\n";
            echo "\n";
            exit(1);
        }

        return $unique($stopwords);
    }

    if ($ext === 'php') {
        // NOTE(review): this executes the supplied file, so the tool must
        // only be pointed at trusted stopword files.
        /** @noinspection PhpIncludeInspection */
        $stopwords = require $stopwords_file;

        if (!is_array($stopwords)) {
            echo "\n";
            echo "Error: PHP file \"{$stopwords_file}\" did not return an array.\n";
            echo "\n";
            exit(1);
        }

        return $unique($stopwords);
    }

    return [];
}
172 +
/**
 * Prints the stopwords as a PHP script that returns them as an array.
 *
 * The output starts with a UTF-8 byte order mark followed by the opening
 * PHP tag and a header comment carrying the current UTC timestamp.
 *
 * @param array $stopwords The words to render; keys are ignored.
 *
 * @throws Exception When the timestamp cannot be created.
 */
function render_php_output(array $stopwords)
{
    $timestamp = (new DateTime('now', new DateTimeZone('UTC')))->format(DateTime::ATOM);

    echo "\xEF\xBB\xBF<?php\n";
    echo "\n";
    echo "/**\n";
    echo " * Stopwords list for the use in the PHP package rake-php-plus.\n";
    echo " * See: https://github.com/Donatello-za/rake-php-plus\n";
    echo " *\n";
    echo " * Extracted using extractor.php @ {$timestamp} \n";
    echo " */\n";
    echo "\n";
    echo 'return [' . "\n";

    $lines = [];
    foreach ($stopwords as $word) {
        // Backslashes have to be escaped along with single quotes,
        // otherwise a word ending in "\" would swallow the closing quote
        // and the generated script would no longer parse.
        $escaped = strtr((string) $word, ['\\' => '\\\\', "'" => "\\'"]);
        $lines[] = " '" . $escaped . "'";
    }

    // Every entry except the last one is followed by a comma.
    if (count($lines) > 0) {
        echo implode(",\n", $lines) . "\n";
    }

    echo "];\n";
    echo "\n";
}
205 +
/**
 * Prints the stopwords as one case-insensitive regular expression in
 * which every word is an alternative enclosed in word boundaries.
 *
 * The output starts with a UTF-8 byte order mark and ends with a newline.
 *
 * @param array $stopwords The words to render.
 */
function render_pattern_output(array $stopwords)
{
    $regex = [];

    foreach ($stopwords as $word) {
        // Escape regex metacharacters and the "/" delimiter so that words
        // such as "e.g." or "and/or" match literally and cannot break
        // the generated pattern.
        $quoted = preg_quote((string) $word, '/');

        if (mb_strlen($word) === 1) {
            // This pattern allows for words such as a-class and j'aimerais, however,
            // words such as day-z will be broken up into day- and the z will go
            // missing. A possible workaround is to set the pattern as:
            // '\b(?!-)' . $word . '(?!(-|\'))\b'
            // but then two character words such as WA will also be stripped out.
            $regex[] = '\b' . $quoted . '(?!(-|\'))\b';
        } else {
            $regex[] = '\b' . $quoted . '\b';
        }
    }

    echo "\xEF\xBB\xBF" . '/' . implode('|', $regex) . '/i' . "\n";
}
229 +
/**
 * Prints the stopwords as pretty-printed JSON followed by a newline.
 *
 * @param array $stopwords The words to render.
 */
function render_json_output(array $stopwords)
{
    $encoded = json_encode($stopwords, JSON_PRETTY_PRINT);

    echo $encoded, "\n";
}
237 +
/**
 * Sends the stopwords to the renderer that belongs to the requested
 * output format.
 *
 * An empty stopword list prints an error naming the source file and
 * terminates the script with exit code 1. An unknown format prints
 * nothing.
 *
 * @param array  $stopwords      The words to render.
 * @param string $stopwords_file Source file name, used in the error message.
 * @param string $output         One of "pattern", "php" or "json".
 *
 * @throws Exception
 */
function render_output(array $stopwords, $stopwords_file, $output = 'php')
{
    if (count($stopwords) <= 0) {
        echo "\n";
        echo "Error: No stopwords found in file \"{$stopwords_file}\".\n";
        echo "\n";
        exit(1);
    }

    switch ($output) {
        case 'pattern':
            render_pattern_output($stopwords);
            break;

        case 'php':
            render_php_output($stopwords);
            break;

        case 'json':
            render_json_output($stopwords);
            break;
    }
}
263 +
// Script entry point: load the stopwords, optionally sort them using the
// requested locale and render them in the requested output format.
check_args($argc);

$stopwords_file = get_arg_by_index($argv, 1);
$stopwords = load_stopwords($stopwords_file);

$locale = get_arg_by_name($argv, '--locale');
if ($locale === null) {
    // A missing locale is not fatal. The notice goes to STDERR so it does
    // not end up inside a .php or .pattern file when the output is piped.
    fwrite(STDERR, "Please specify the locale, e.g. --locale=en_US\n");
}

if (!has_arg($argv, '--nosort')) {
    // Only switch the collation locale when one was actually requested,
    // and tell the user when the system does not provide it.
    if ($locale !== null && setlocale(LC_COLLATE, $locale . '.utf8') === false) {
        fwrite(STDERR, "Warning: Locale \"{$locale}.utf8\" is not available, sorting with the current locale.\n");
    }

    // Descending order (Z -> A) is the default; --ascending reverses it.
    $descending = !has_arg($argv, '--ascending');
    usort($stopwords, function ($a, $b) use ($descending) {
        return $descending ? strcoll($b, $a) : strcoll($a, $b);
    });
}

$OUTPUT_TYPES = ['pattern', 'php', 'json'];
$output = get_arg_by_name($argv, '--output');
if (!in_array($output, $OUTPUT_TYPES, true)) {
    echo "Please specify the output format, e.g. --output=pattern, --output=php or --output=json\n";
    exit(1);
}

/** @noinspection PhpUnhandledExceptionInspection */
render_output($stopwords, $stopwords_file, $output);
304 +
305 +