Projekt-Ökonomie Demo

1 + <?php

2 +

3 + namespace fivefilters\Readability;

4 +

5 + defined('ABSPATH') or die();

6 + use fivefilters\Readability\Nodes\DOM\DOMDocument;

7 + use fivefilters\Readability\Nodes\DOM\DOMElement;

8 + use fivefilters\Readability\Nodes\DOM\DOMNode;

9 + use fivefilters\Readability\Nodes\DOM\DOMText;

10 + use fivefilters\Readability\Nodes\NodeUtility;

11 + use Psr\Log\LoggerInterface;

12 + use \Masterminds\HTML5;

13 + use League\Uri\Http;

14 + use League\Uri\UriResolver;

15 +

16 + /**

17 + * Class Readability.

18 + */

19 + class Readability

20 + {

21 + /**

22 + * Main DOMDocument where all the magic happens.

23 + *

24 + * @var DOMDocument

25 + */

26 + protected $dom;

27 +

28 + /**

29 + * Title of the article.

30 + *

31 + * @var string|null

32 + */

33 + protected $title = null;

34 +

35 + /**

36 + * Final DOMDocument with the fully parsed HTML.

37 + *

38 + * @var DOMDocument|null

39 + */

40 + protected $content = null;

41 +

42 + /**

43 + * Excerpt of the article.

44 + *

45 + * @var string|null

46 + */

47 + protected $excerpt = null;

48 +

49 + /**

50 + * Main image of the article.

51 + *

52 + * @var string|null

53 + */

54 + protected $image = null;

55 +

56 + /**

57 + * Author of the article. Extracted from the byline tags and other social media properties.

58 + *

59 + * @var string|null

60 + */

61 + protected $author = null;

62 +

63 + /**

64 + * Website name.

65 + *

66 + * @var string|null

67 + */

68 + protected $siteName = null;

69 +

70 + /**

71 + * Direction of the text.

72 + *

73 + * @var string|null

74 + */

75 + protected $direction = null;

76 +

77 + /**

78 + * Base URI

79 + * HTML5PHP doesn't appear to store it in the baseURI property like PHP's DOMDocument does when parsing with libxml

80 + *

81 + * @var string|null

82 + */

83 + protected $baseURI = null;

84 +

85 + /**

86 + * Configuration object.

87 + *

88 + * @var Configuration

89 + */

90 + private $configuration;

91 +

92 + /**

93 + * Logger object.

94 + *

95 + * @var LoggerInterface

96 + */

97 + private $logger;

98 +

99 + /**

100 + * JSON-LD

101 + *

102 + * @var array

103 + */

104 + private $jsonld = [];

105 +

106 + /**

107 + * Collection of attempted text extractions.

108 + *

109 + * @var array

110 + */

111 + private $attempts = [];

112 +

113 + /**

114 + * @var array

115 + */

116 + private $defaultTagsToScore = [

117 + 'section',

118 + 'h2',

119 + 'h3',

120 + 'h4',

121 + 'h5',

122 + 'h6',

123 + 'p',

124 + 'td',

125 + 'pre',

126 + ];

127 +

128 + /**

129 + * @var array

130 + */

131 + private $unlikelyRoles = ['menu', 'menubar', 'complementary', 'navigation', 'alert', 'alertdialog', 'dialog'];

132 +

133 + /**

134 + * @var array

135 + */

136 + private $alterToDIVExceptions = [

137 + 'div',

138 + 'article',

139 + 'section',

140 + 'p',

141 + ];

142 +

143 + /**

144 + * @var array

145 + */

146 + private $htmlEscapeMap = [

147 + 'lt' => '<',

148 + 'gt' => '>',

149 + 'amp' => '&',

150 + 'quot' => '"',

151 + 'apos' => '\'',

152 + ];

153 +

/**
 * Builds a Readability instance around the supplied configuration.
 *
 * The logger is read from the configuration once and kept on the instance.
 *
 * @param Configuration $configuration
 */
public function __construct(Configuration $configuration)
{
    $this->logger = $configuration->getLogger();
    $this->configuration = $configuration;
}

164 +

/**
 * Main parse function.
 *
 * Loads the HTML, collects metadata and the main image, and then scores the document in a loop. Every pass that
 * stays below the configured character threshold switches off one more flag (StripUnlikelyCandidates, then
 * WeightClasses, then CleanConditionally), reloads the HTML and tries again. When every flag is already off, the
 * longest attempt collected so far is used.
 *
 * NOTE(review): the flags are switched off on the Configuration object itself and are not restored here.
 *
 * @param string $html
 *
 * @throws ParseException When the HTML has no usable body or no text could be extracted at all.
 *
 * @return bool True on success, false when scoring produced no result.
 */
public function parse($html)
{
    $this->logger->info('*** Starting parse process...');

    // Attempts recorded by an earlier parse() call on this instance must not compete with this document.
    $this->attempts = [];

    $this->dom = $this->loadHTML($html);

    // Checking for minimum HTML to work with.
    if (!($root = $this->dom->getElementsByTagName('body')->item(0)) || !$root->firstChild) {
        $this->logger->emergency('No body tag present or body tag empty');

        throw new ParseException('Invalid or incomplete HTML.');
    }

    $this->getMetadata();

    $this->getMainImage();

    while (true) {
        $this->logger->debug('Starting parse loop');
        $root = $root->firstChild;

        $elementsToScore = $this->getNodes($root);
        $this->logger->debug(sprintf('Elements to score: \'%s\'', count($elementsToScore)));

        $result = $this->rateNodes($elementsToScore);

        /*
         * Now that we've gone through the full algorithm, check to see if
         * we got any meaningful content. If we didn't, we may need to re-run
         * grabArticle with different flags set. This gives us a higher likelihood of
         * finding the content, and the sieve approach gives us a higher likelihood of
         * finding the -right- content.
         */

        // A falsy $result is treated as a failure after the loop, so only measure real results.
        $length = 0;
        if ($result) {
            $length = readability_mb_strlen(preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $result->textContent));
        }

        $this->logger->info(sprintf('[Parsing] Article parsed. Amount of words: %s. Current threshold is: %s', $length, $this->configuration->getCharThreshold()));

        if (!$result || $length >= $this->configuration->getCharThreshold()) {
            break;
        }

        // Threshold not met: start over from a fresh DOM and remember what this pass produced.
        $this->dom = $this->loadHTML($html);
        $root = $this->dom->getElementsByTagName('body')->item(0);
        $this->attempts[] = ['articleContent' => $result, 'textLength' => $length];

        if ($this->configuration->getStripUnlikelyCandidates()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting StripUnlikelyCandidates as false');
            $this->configuration->setStripUnlikelyCandidates(false);
        } elseif ($this->configuration->getWeightClasses()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting WeightClasses as false');
            $this->configuration->setWeightClasses(false);
        } elseif ($this->configuration->getCleanConditionally()) {
            $this->logger->debug('[Parsing] Threshold not met, trying again setting CleanConditionally as false');
            $this->configuration->setCleanConditionally(false);
        } else {
            $this->logger->debug('[Parsing] Threshold not met, searching across attempts for some content.');

            // No luck after removing flags, just return the longest text we found during the different loops
            usort($this->attempts, function ($a, $b) {
                return $b['textLength'] - $a['textLength'];
            });

            // But first check if we actually have something
            if (!$this->attempts[0]['textLength']) {
                $this->logger->emergency('[Parsing] Could not parse text, giving up :(');

                throw new ParseException('Could not parse text.');
            }

            $this->logger->debug('[Parsing] Threshold not met, but found some content in previous attempts.');

            $result = $this->attempts[0]['articleContent'];
            break;
        }
    }

    if (!$result) {
        $this->logger->info('*** Parse failed :(');
        return false;
    }

    $result = $this->postProcessContent($result);

    // If we haven't found an excerpt in the article's metadata, use the article's
    // first paragraph as the excerpt. This can be used for displaying a preview of
    // the article's content.
    if (!$this->getExcerpt()) {
        $this->logger->debug('[Parsing] No excerpt text found on metadata, extracting first p node and using it as excerpt.');
        $paragraphs = $result->getElementsByTagName('p');
        if ($paragraphs->length > 0) {
            $this->setExcerpt(trim($paragraphs->item(0)->textContent));
        }
    }

    $this->setContent($result);

    $this->logger->info('*** Parse successful :)');

    return true;
}

278 +

/**
 * Creates a DOM Document object and loads the provided HTML on it.
 *
 * Used for the first load of Readability and subsequent reloads (when disabling flags and rescanning the text)
 * Previous versions of Readability used this method one time and cloned the DOM to keep a backup. This caused bugs
 * because cloning the DOM object keeps a relation between the clone and the original one, doing changes in both
 * objects and ruining the backup.
 *
 * Besides returning the document, this method updates $this->baseURI and $this->jsonld.
 *
 * @param string $html
 *
 * @return DOMDocument
 */
private function loadHTML($html)
{
    $this->logger->debug('[Loading] Loading HTML...');

    // To avoid throwing a gazillion of errors on malformed HTMLs
    libxml_use_internal_errors(true);

    if ($this->configuration->getParser() === 'html5') {
        $this->logger->debug('[Loading] Using HTML5 parser...');
        $html5 = new HTML5(['disable_html_ns' => true, 'target_document' => new DOMDocument('1.0', 'utf-8')]);
        $dom = $html5->loadHTML($html);
        //TODO: Improve this so it looks inside <html><head><base>, not just any <base>
        $base = $dom->getElementsByTagName('base');
        if ($base->length > 0) {
            $base = $base->item(0);
            $base = $base->getAttribute('href');
            if ($base != '') {
                $this->baseURI = $base;
            }
        }
    } else {
        $this->logger->debug('[Loading] Using libxml parser...');
        $dom = new DOMDocument('1.0', 'utf-8');
        if ($this->configuration->getNormalizeEntities() && function_exists('mb_convert_encoding')) {
            $this->logger->debug('[Loading] Normalized entities via mb_convert_encoding.');
            // Replace UTF-8 characters with the HTML Entity equivalent. Useful to fix html with mixed content
            $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
        }
    }

    if (!$this->configuration->getSubstituteEntities()) {
        // Keep the original HTML entities
        $dom->substituteEntities = false;
    }

    // NOTE(review): with the html5 parser the document is already parsed at this point, so this regex only
    // affects the libxml branch below. Confirm whether that is intended.
    if ($this->configuration->getSummonCthulhu()) {
        $this->logger->debug('[Loading] Removed script tags via regex H̶͈̩̟̬̱͠E̡̨̬͔̳̜͢͠ ̡̧̯͉̩͙̩̹̞̠͎͈̹̥̠͞ͅͅC̶͉̞̘̖̝̗͓̬̯͍͉̤̬͢͢͞Ò̟̘͉͖͎͉̱̭̣̕M̴̯͈̻̱̱̣̗͈̠̙̲̥͘͞E̷̛͙̼̲͍͕̹͍͇̗̻̬̮̭̱̥͢Ş̛̟͔̙̜̤͇̮͍̙̝̀͘');
        // Tag names are case-insensitive in HTML, so <SCRIPT> must be stripped as well.
        $html = preg_replace('/<script\b[^>]*>([\s\S]*?)<\/script>/i', '', $html);
    }

    // Prepend the XML tag to avoid having issues with special characters. Should be harmless.
    if ($this->configuration->getParser() !== 'html5') {
        $dom->loadHTML('<?xml encoding="UTF-8">' . $html);
        $this->baseURI = $dom->baseURI;
    }
    $dom->encoding = 'UTF-8';

    // Internal error collection is switched on above and parse() reloads the HTML several times; drop the
    // collected errors so the libxml error buffer does not keep growing.
    libxml_clear_errors();

    // Unwrap image from noscript
    $this->unwrapNoscriptImages($dom);

    // Extract JSON-LD metadata before removing scripts
    $this->jsonld = $this->configuration->getDisableJSONLD() ? [] : $this->getJSONLD($dom);

    $this->removeScripts($dom);

    $this->prepDocument($dom);

    $this->logger->debug('[Loading] Loaded HTML successfully.');

    return $dom;
}

354 +

/**
 * Try to extract metadata from JSON-LD object.
 * For now, only Schema.org objects of type Article or its subtypes are supported.
 *
 * Only the first <script type="application/ld+json"> element of the document is inspected.
 *
 * @param DOMDocument $dom
 *
 * @return array Any of the keys 'title', 'byline', 'excerpt' and 'siteName' that could be extracted (possibly none)
 */
private function getJSONLD(DOMDocument $dom)
{
    $scripts = $this->_getAllNodesWithTag($dom, ['script']);

    $jsonLdElement = $this->findNode($scripts, function ($el) {
        return $el->getAttribute('type') === 'application/ld+json';
    });

    if ($jsonLdElement) {
        try {
            // Strip CDATA markers if present
            $content = preg_replace('/^\s*<!\[CDATA\[|\]\]>\s*$/', '', $jsonLdElement->textContent);
            // Invalid JSON decodes to null and is rejected by the @context check below.
            $parsed = json_decode($content, true);
            $metadata = [];
            // Accept the context both with and without a trailing slash ("https://schema.org/").
            if (
                !isset($parsed['@context']) ||
                !is_string($parsed['@context']) ||
                !preg_match('/^https?\:\/\/schema\.org\/?$/', $parsed['@context'])
            ) {
                return $metadata;
            }

            if (!isset($parsed['@type']) && isset($parsed['@graph']) && is_array($parsed['@graph'])) {
                $_found = null;
                foreach ($parsed['@graph'] as $it) {
                    if (isset($it['@type']) && is_string($it['@type']) && preg_match(NodeUtility::$regexps['jsonLdArticleTypes'], $it['@type'])) {
                        // Use the first article-like node of the graph, not the last one.
                        $_found = $it;
                        break;
                    }
                }
                $parsed = $_found;
            }

            if (
                !$parsed ||
                !isset($parsed['@type']) ||
                !is_string($parsed['@type']) ||
                !preg_match(NodeUtility::$regexps['jsonLdArticleTypes'], $parsed['@type'])
            ) {
                return $metadata;
            }
            if (isset($parsed['name']) && is_string($parsed['name'])) {
                $metadata['title'] = trim($parsed['name']);
            } elseif (isset($parsed['headline']) && is_string($parsed['headline'])) {
                $metadata['title'] = trim($parsed['headline']);
            }
            if (isset($parsed['author'])) {
                if (isset($parsed['author']['name']) && is_string($parsed['author']['name'])) {
                    // Single author object.
                    $metadata['byline'] = trim($parsed['author']['name']);
                } elseif (
                    is_array($parsed['author']) &&
                    isset($parsed['author'][0]) &&
                    is_array($parsed['author'][0]) &&
                    isset($parsed['author'][0]['name']) &&
                    is_string($parsed['author'][0]['name'])
                ) {
                    // List of author objects: keep the ones with a usable name and join them.
                    $metadata['byline'] = array_filter($parsed['author'], function ($author) {
                        return is_array($author) && isset($author['name']) && is_string($author['name']);
                    });
                    $metadata['byline'] = array_map(function ($author) {
                        return trim($author['name']);
                    }, $metadata['byline']);
                    $metadata['byline'] = implode(', ', $metadata['byline']);
                }
            }
            if (isset($parsed['description']) && is_string($parsed['description'])) {
                $metadata['excerpt'] = trim($parsed['description']);
            }
            if (
                isset($parsed['publisher']) &&
                is_array($parsed['publisher']) &&
                isset($parsed['publisher']['name']) &&
                is_string($parsed['publisher']['name'])
            ) {
                $metadata['siteName'] = trim($parsed['publisher']['name']);
            }
            return $metadata;
        } catch (\Exception $err) {
            // The try-catch blocks are from the JS version. Not sure if there's anything
            // here in the PHP version that would trigger an error or exception, so perhaps we can
            // remove the try-catch blocks here (or at least translate errors to exceptions for this bit)
            $this->logger->debug('[JSON-LD] Error parsing: ' . $err->getMessage());
        }
    }
    return [];
}

447 +

/**
 * Tries to guess relevant info from metadata of the html. Sets the results in the Readability properties.
 *
 * Reads every <meta> tag of $this->dom and fills title, author, excerpt, image and site name. Values found in
 * $this->jsonld win over <meta> values for title, author, excerpt and site name.
 */
private function getMetadata()
{
    $this->logger->debug('[Metadata] Retrieving metadata...');

    $values = [];
    // property is a space-separated list of values
    // The patterns cover the prefixes and names that the lookups below consume. The negative lookahead keeps
    // sub-properties such as "og:image:width" from being stored under the "og:image" key.
    $propertyPattern = '/\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|image|site_name)(?!:)\s*/i';

    // name is a single value
    $namePattern = '/^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|image|site_name)\s*$/i';

    // Find description tags.
    foreach ($this->dom->getElementsByTagName('meta') as $meta) {
        /* @var DOMNode $meta */
        $elementName = $meta->getAttribute('name');
        $elementProperty = $meta->getAttribute('property');
        $content = $meta->getAttribute('content');
        $matches = null;
        $name = null;

        if ($elementProperty) {
            if (preg_match($propertyPattern, $elementProperty, $matches)) {
                // Lowercase and strip whitespace so "OG : Title" ends up as "og:title".
                $name = preg_replace('/\s/', '', readability_mb_strtolower($matches[0]));
                $values[$name] = trim($content);
            }
        }

        // The name attribute is only considered when the property attribute produced no match.
        if (!$matches && $elementName && preg_match($namePattern, $elementName)) {
            $name = $elementName;
            if ($content) {
                // Convert to lowercase, remove any whitespace, and convert dots
                // to colons so we can match below.
                $name = preg_replace(['/\s/', '/\./'], ['', ':'], readability_mb_strtolower($name));
                $values[$name] = trim($content);
            }
        }
    }

    // Returns the value of the first candidate key present in $values, or null when none is.
    $firstValue = function (array $candidates) use ($values) {
        foreach ($candidates as $candidate) {
            if (isset($values[$candidate])) {
                return $values[$candidate];
            }
        }

        return null;
    };

    // get title
    if (isset($this->jsonld['title'])) {
        $this->setTitle($this->jsonld['title']);
    } else {
        $this->setTitle($firstValue([
            'dc:title',
            'dcterm:title',
            'og:title',
            'weibo:article:title',
            'weibo:webpage:title',
            'title',
            'twitter:title',
        ]));
    }

    if (!$this->getTitle()) {
        $this->setTitle($this->getArticleTitle());
    }

    // get author
    if (isset($this->jsonld['byline'])) {
        $this->setAuthor($this->jsonld['byline']);
    } else {
        $this->setAuthor($firstValue([
            'dc:creator',
            'dcterm:creator',
            'author',
        ]));
    }

    // get description
    if (isset($this->jsonld['excerpt'])) {
        $this->setExcerpt($this->jsonld['excerpt']);
    } else {
        $this->setExcerpt($firstValue([
            'dc:description',
            'dcterm:description',
            'og:description',
            'weibo:article:description',
            'weibo:webpage:description',
            'description',
            'twitter:description',
        ]));
    }

    // get main image
    $this->setImage($firstValue([
        'image',
        'og:image',
        'twitter:image',
    ]));

    // get site name
    if (isset($this->jsonld['siteName'])) {
        $this->setSiteName($this->jsonld['siteName']);
    } else {
        $this->setSiteName($firstValue([
            'og:site_name',
        ]));
    }

    // in many sites the meta value is escaped with HTML entities,
    // so here we need to unescape it
    $this->setTitle($this->unescapeHtmlEntities($this->getTitle()));
    $this->setAuthor($this->unescapeHtmlEntities($this->getAuthor()));
    $this->setExcerpt($this->unescapeHtmlEntities($this->getExcerpt()));
    $this->setSiteName($this->unescapeHtmlEntities($this->getSiteName()));
}

574 +

/**
 * Collects the image URLs of the parsed article.
 *
 * The main image (when set) comes first, followed by the src of every <img> in the parsed document. When
 * FixRelativeURLs is enabled each URL is made absolute. Empty entries and duplicates are dropped.
 *
 * @return array
 */
public function getImages()
{
    $images = [];

    $mainImage = $this->getImage();
    if ($mainImage) {
        $images[] = $this->getImage();
    }

    // Without a parsed document only the main image (if any) is available.
    if (null == $this->getDOMDocument()) {
        return $images;
    }

    foreach ($this->getDOMDocument()->getElementsByTagName('img') as $imgNode) {
        $source = $imgNode->getAttribute('src');
        if ($source) {
            $images[] = $source;
        }
    }

    if ($this->configuration->getFixRelativeURLs()) {
        $images = array_map(function ($imgSrc) {
            return $this->toAbsoluteURI($imgSrc);
        }, $images);
    }

    return array_unique(array_filter($images));
}

607 +

/**
 * Tries to get the main article image. Will only update the metadata if the getMetadata function couldn't
 * find a correct image.
 *
 * Falls back to the href of the first <link rel="img_src"> or <link rel="image_src"> tag. The URL is made absolute
 * only when FixRelativeURLs is enabled; an image discovered through a <link> tag is stored either way.
 */
public function getMainImage()
{
    $metadataImage = $this->getImage();
    $imgUrl = $metadataImage !== null ? $metadataImage : false;

    if (!$imgUrl) {
        foreach ($this->dom->getElementsByTagName('link') as $link) {
            /** @var \DOMElement $link */
            /*
             * Check for the rel attribute, then check if the rel attribute is either img_src or image_src, and
             * finally check for the existence of the href attribute, which should hold the image url.
             */
            if ($link->hasAttribute('rel') && ($link->getAttribute('rel') === 'img_src' || $link->getAttribute('rel') === 'image_src') && $link->hasAttribute('href')) {
                $imgUrl = $link->getAttribute('href');
                break;
            }
        }
    }

    if (empty($imgUrl)) {
        return;
    }

    if ($this->configuration->getFixRelativeURLs()) {
        $this->setImage($this->toAbsoluteURI($imgUrl));
    } elseif ($imgUrl !== $metadataImage) {
        // Found through a <link> tag: keep it even though URL fixing is disabled.
        $this->setImage($imgUrl);
    }
}

638 +

/**
 * Flattens needless div/section nesting inside the article.
 *
 * Walks the document node by node. A div or section whose id does not start with "readability" is removed when
 * it has no content, or replaced by its only child (which inherits the wrapper's attributes) when that child is
 * itself a single div or section.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
private function simplifyNestedElements(DOMDocument $article)
{
    $current = $article;

    while ($current) {
        $isWrapper = $current->parentNode && in_array($current->nodeName, ['div', 'section']);

        if ($isWrapper && (!$current->hasAttribute('id') || strpos($current->getAttribute('id'), 'readability') !== 0)) {
            if ($current->isElementWithoutContent()) {
                $current = NodeUtility::removeAndGetNext($current);
                continue;
            }

            if ($current->hasSingleTagInsideElement('div') || $current->hasSingleTagInsideElement('section')) {
                $onlyChild = $current->children()->item(0);
                // Carry the wrapper's attributes over to the child that takes its place.
                foreach ($current->attributes as $attribute) {
                    $onlyChild->setAttribute($attribute->name, $attribute->value);
                }
                $current->parentNode->replaceChild($onlyChild, $current);
                // Re-examine the promoted child on the next iteration.
                $current = $onlyChild;
                continue;
            }
        }

        $current = NodeUtility::getNextNode($current);
    }
}

669 +

/**
 * Returns the title of the html. Prioritizes the title from the metadata against the title tag.
 *
 * The raw title is cleaned up heuristically: a trailing or leading site name separated by |, -, \, /, > or » is
 * dropped, a "Section: Title" prefix is removed unless an h1/h2 carries the full string, and very long or very
 * short titles are replaced by the document's only h1. If the cleaned title ends up with four words or fewer and
 * looks like an over-eager cut, the original title is returned instead.
 *
 * @return string|null Null when neither the metadata nor a <title> tag provide a title.
 */
private function getArticleTitle()
{
    $originalTitle = null;

    if ($this->getTitle()) {
        $originalTitle = $this->getTitle();
    } else {
        $this->logger->debug('[Metadata] Could not find title in metadata, searching for the title tag...');
        $titleTag = $this->dom->getElementsByTagName('title');
        if ($titleTag->length > 0) {
            $this->logger->info(sprintf('[Metadata] Using title tag as article title: \'%s\'', $titleTag->item(0)->nodeValue));
            $originalTitle = $titleTag->item(0)->nodeValue;
        }
    }

    if ($originalTitle === null) {
        return null;
    }

    $curTitle = $originalTitle = trim($originalTitle);
    $titleHadHierarchicalSeparators = false;

    /*
     * If there's a separator in the title, first remove the final part.
     *
     * The patterns use the "u" modifier so that "»" is one character of the class. Without it the class holds the
     * two UTF-8 bytes of "»" separately and can cut other multi-byte characters in half.
     */
    if (preg_match('/ [\|\-\\\\\/>»] /iu', $curTitle)) {
        $titleHadHierarchicalSeparators = (bool) preg_match('/ [\\\\\/>»] /u', $curTitle);

        $withoutLastPart = preg_replace('/(.*)[\|\-\\\\\/>»] .*/iu', '$1', $originalTitle);
        if ($withoutLastPart !== null) {
            $curTitle = $withoutLastPart;
        }

        $this->logger->info(sprintf('[Metadata] Found hierarchical separators in title, new title is: \'%s\'', $curTitle));

        // If the resulting title is too short (fewer than 3 words), remove
        // the first part instead:
        if (count(preg_split('/\s+/', $curTitle)) < 3) {
            $withoutFirstPart = preg_replace('/[^\|\-\\\\\/>»]*[\|\-\\\\\/>»](.*)/iu', '$1', $originalTitle);
            if ($withoutFirstPart !== null) {
                $curTitle = $withoutFirstPart;
            }
            $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
        }
    } elseif (strpos($curTitle, ': ') !== false) {
        // Check if we have an heading containing this exact string, so we
        // could assume it's the full title.
        $match = false;
        // Trim texts to avoid having false negatives when the title is surrounded by spaces or tabs
        $wanted = trim($curTitle);
        for ($i = 1; $i <= 2 && !$match; $i++) {
            foreach ($this->dom->getElementsByTagName('h' . $i) as $hTag) {
                if (trim($hTag->nodeValue) === $wanted) {
                    $match = true;
                    break;
                }
            }
        }

        // If we don't, let's extract the title out of the original title string.
        if (!$match) {
            $curTitle = substr($originalTitle, strrpos($originalTitle, ':') + 1);

            $this->logger->info(sprintf('[Metadata] Title has a colon in the middle, new title is: \'%s\'', $curTitle));

            // If the title is now too short, try the first colon instead:
            if (count(preg_split('/\s+/', $curTitle)) < 3) {
                $curTitle = substr($originalTitle, strpos($originalTitle, ':') + 1);
                $this->logger->info(sprintf('[Metadata] Title too short, using the first part of the title instead: \'%s\'', $curTitle));
            } elseif (count(preg_split('/\s+/', substr($originalTitle, 0, strpos($originalTitle, ':')))) > 5) {
                // But if we have too many words before the colon there's something weird
                // with the titles and the H tags so let's just use the original title instead.
                // The words are counted on the original title: $curTitle no longer contains a colon here.
                $curTitle = $originalTitle;
            }
        }
    } elseif (readability_mb_strlen($curTitle) > 150 || readability_mb_strlen($curTitle) < 15) {
        $hOnes = $this->dom->getElementsByTagName('h1');

        if ($hOnes->length === 1) {
            $curTitle = $hOnes->item(0)->nodeValue;
            $this->logger->info(sprintf('[Metadata] Using title from an H1 node: \'%s\'', $curTitle));
        }
    }

    $curTitle = preg_replace(NodeUtility::$regexps['normalize'], ' ', trim($curTitle));

    /*
     * If we now have 4 words or fewer as our title, and either no
     * 'hierarchical' separators (\, /, > or ») were found in the original
     * title or we decreased the number of words by more than 1 word, use
     * the original title.
     */
    $curTitleWordCount = count(preg_split('/\s+/', $curTitle));
    $originalTitleWordCount = count(preg_split('/\s+/', (string) preg_replace('/[\|\-\\\\\/>»]+/u', '', $originalTitle))) - 1;

    if ($curTitleWordCount <= 4 &&
        (!$titleHadHierarchicalSeparators || $curTitleWordCount !== $originalTitleWordCount)) {
        $curTitle = $originalTitle;

        $this->logger->info(sprintf('[Metadata] Cleaned title too short, using the original title instead: \'%s\'', $curTitle));
    }

    return $curTitle;
}

773 +

/**
 * Turns a possibly relative URI into an absolute one.
 *
 * The scheme, host and base path come from getPathInfo() for the configured original URL.
 *
 * @param $uri string URI to convert
 *
 * @return string
 */
private function toAbsoluteURI($uri)
{
    $pathInfo = $this->getPathInfo($this->configuration->getOriginalURL());
    $pathBase = $pathInfo[0];
    $scheme = $pathInfo[1];
    $prePath = $pathInfo[2];

    $uri = trim($uri);

    // Already absolute (starts with a scheme) or a pure fragment: nothing to resolve.
    if (preg_match('/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/', $uri) || strpos($uri, '#') === 0) {
        return $uri;
    }

    // Scheme-relative URI ("//host/path"): only the scheme is missing.
    if (strpos($uri, '//') === 0) {
        return $scheme . '://' . substr($uri, 2);
    }

    // Host-relative URI ("/path"): prepend scheme and host.
    if (strpos($uri, '/') === 0) {
        return $prePath . $uri;
    }

    // Anything else is resolved against the base path, which also handles "./" and "../" segments.
    $base = Http::createFromString($pathBase);
    $relative = Http::createFromString($uri);

    return (string) UriResolver::resolve($relative, $base);
}

820 +

/**
 * Returns full path info of an URL.
 *
 * The base path is built from the scheme and host of $url plus the directory of its path. A base URI recorded
 * while loading the document ($this->baseURI) changes it as follows:
 *  - an absolute http(s) or scheme-relative ("//host/...") base replaces scheme, host and directory,
 *  - a base starting with "/" replaces the directory,
 *  - any other base is appended to the directory of $url.
 *
 * NOTE(review): ports and credentials of $url are not carried over into the result.
 *
 * @param string $url
 *
 * @return array [$pathBase, $scheme, $prePath]
 */
public function getPathInfo($url)
{
    $urlScheme = parse_url($url, PHP_URL_SCHEME);
    $origin = $urlScheme . '://' . parse_url($url, PHP_URL_HOST);

    // Directory of the document with a single trailing slash. dirname() yields "/" for root-level documents
    // (and "\" on Windows), which used to produce a base such as "http://host//".
    $directory = rtrim(str_replace('\\', '/', dirname((string) parse_url($url, PHP_URL_PATH))), '/') . '/';

    // Absolute form of the base URI, when it names its own host.
    $absoluteBase = null;
    if ($this->baseURI !== null) {
        $candidate = strpos($this->baseURI, '//') === 0 ? $urlScheme . ':' . $this->baseURI : $this->baseURI;
        if (preg_match('/^https?:\/\//i', $candidate) && parse_url($candidate, PHP_URL_HOST)) {
            $absoluteBase = $candidate;
        }
    }

    if ($absoluteBase !== null) {
        // Keep everything up to the last slash of the base path; a missing path means the root directory.
        $basePath = (string) parse_url($absoluteBase, PHP_URL_PATH);
        $lastSlash = strrpos($basePath, '/');
        $baseDirectory = $lastSlash === false ? '/' : substr($basePath, 0, $lastSlash + 1);
        $pathBase = parse_url($absoluteBase, PHP_URL_SCHEME) . '://' . parse_url($absoluteBase, PHP_URL_HOST) . $baseDirectory;
    } elseif ($this->baseURI !== null) {
        if (substr($this->baseURI, 0, 1) === '/') {
            // URLs starting with '/' override completely the URL defined in the link
            $pathBase = $origin . $this->baseURI;
        } else {
            // Otherwise just prepend the base to the actual path
            $pathBase = $origin . $directory . rtrim($this->baseURI, '/') . '/';
        }
    } else {
        $pathBase = $origin . $directory;
    }

    $scheme = parse_url($pathBase, PHP_URL_SCHEME);
    $prePath = $scheme . '://' . parse_url($pathBase, PHP_URL_HOST);

    return [$pathBase, $scheme, $prePath];
}

848 +

849 + /**

850 + * Gets nodes from the root element.

851 + *

852 + * @param $node DOMNode|DOMText

853 + *

854 + * @return array

855 + */

856 + private function getNodes($node)

857 + {

858 + $this->logger->info('[Get Nodes] Retrieving nodes...');

859 +

860 + $stripUnlikelyCandidates = $this->configuration->getStripUnlikelyCandidates();

861 +

862 + $elementsToScore = [];

863 +

864 + $shouldRemoveTitleHeader = true;

865 +

866 + /*

867 + * First, node prepping. Trash nodes that look cruddy (like ones with the

868 + * class name "comment", etc), and turn divs into P tags where they have been

869 + * used inappropriately (as in, where they contain no other block level elements.)

870 + */

871 +

872 + while ($node) {

873 + // Remove DOMComments nodes as we don't need them and mess up children counting

874 + if ($node->nodeType === XML_COMMENT_NODE) {

875 + $this->logger->debug(sprintf('[Get Nodes] Found comment node, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));

876 + $node = NodeUtility::removeAndGetNext($node);

877 + continue;

878 + }

879 +

880 + $matchString = $node->getAttribute('class') . ' ' . $node->getAttribute('id');

881 +

882 + if (!$node->isProbablyVisible()) {

883 + $this->logger->debug(sprintf('[Get Nodes] Removing hidden node... Match string was: \'%s\'', $matchString));

884 + $node = NodeUtility::removeAndGetNext($node);

885 + continue;

886 + }

887 +

888 + // Check to see if this node is a byline, and remove it if it is.

889 + if ($this->checkByline($node, $matchString)) {

890 + $this->logger->debug(sprintf('[Get Nodes] Found byline, removing... Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));

891 + $node = NodeUtility::removeAndGetNext($node);

892 + continue;

893 + }

894 +

895 + if ($shouldRemoveTitleHeader && $this->headerDuplicatesTitle($node)) {

896 + $this->logger->debug(sprintf('Removing header: %s', $node->getTextContent()));

897 + $shouldRemoveTitleHeader = false;

898 + $node = NodeUtility::removeAndGetNext($node);

899 + continue;

900 + }

901 +

902 + // Remove unlikely candidates

903 + if ($stripUnlikelyCandidates) {

904 + if (

905 + preg_match(NodeUtility::$regexps['unlikelyCandidates'], $matchString) &&

906 + !preg_match(NodeUtility::$regexps['okMaybeItsACandidate'], $matchString) &&

907 + !$node->hasAncestorTag( 'table') &&

908 + !$node->hasAncestorTag( 'code') &&

909 + $node->nodeName !== 'body' &&

910 + $node->nodeName !== 'a'

911 + ) {

912 + $this->logger->debug(sprintf('[Get Nodes] Removing unlikely candidate. Node content was: \'%s\'', substr($node->nodeValue, 0, 128)));

913 + $node = NodeUtility::removeAndGetNext($node);

914 + continue;

915 + }

916 + }

917 +

918 + if (in_array($node->getAttribute('role'), $this->unlikelyRoles)) {

919 + $this->logger->debug(sprintf('Removing content with role %s - %s', $node->getAttribute('role'), $matchString));

920 + $node = NodeUtility::removeAndGetNext($node);

921 + continue;

922 + }

923 +

924 + // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).

925 + if (($node->nodeName === 'div' || $node->nodeName === 'section' || $node->nodeName === 'header' ||

926 + $node->nodeName === 'h1' || $node->nodeName === 'h2' || $node->nodeName === 'h3' ||

927 + $node->nodeName === 'h4' || $node->nodeName === 'h5' || $node->nodeName === 'h6' ||

928 + $node->nodeName === 'p') &&

929 + $node->isElementWithoutContent()) {

930 + $this->logger->debug(sprintf('[Get Nodes] Removing empty \'%s\' node.', $node->nodeName));

931 + $node = NodeUtility::removeAndGetNext($node);

932 + continue;

933 + }

934 +

935 + if (in_array(strtolower($node->nodeName), $this->defaultTagsToScore)) {

936 + $this->logger->debug(sprintf('[Get Nodes] Adding node to score list, node content is: \'%s\'', substr($node->nodeValue, 0, 128)));

937 + $elementsToScore[] = $node;

938 + }

939 +

940 + // Turn all divs that don't have children block level elements into p's

941 + if ($node->nodeName === 'div') {

942 + // Put phrasing content into paragraphs.

943 + $p = null;

944 + $childNode = $node->firstChild;

945 + while ($childNode) {

946 + $nextSibling = $childNode->nextSibling;

947 + if ($childNode->isPhrasingContent()) {

948 + if ($p !== null) {

949 + $p->appendChild($childNode);

950 + } elseif (!$childNode->isWhitespace()) {

951 + $p = $this->dom->createElement('p');

952 + $node->replaceChild($p, $childNode);

953 + $p->appendChild($childNode);

954 + }

955 + } elseif ($p !== null) {

956 + while ($p->lastChild && $p->lastChild->isWhitespace()) {

957 + $p->removeChild($p->lastChild);

958 + }

959 + $p = null;

960 + }

961 + $childNode = $nextSibling;

962 + }

963 +

964 + /*

965 + * Sites like http://mobile.slate.com encloses each paragraph with a DIV

966 + * element. DIVs with only a P element inside and no text content can be

967 + * safely converted into plain P elements to avoid confusing the scoring

968 + * algorithm with DIVs with are, in practice, paragraphs.

969 + */

970 + if ($node->hasSingleTagInsideElement('p') && $node->getLinkDensity() < 0.25) {

971 + $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single P node, removing DIV. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));

972 + $pNode = NodeUtility::filterTextNodes($node->childNodes)->item(0);

973 + $node->parentNode->replaceChild($pNode, $node);

974 + $node = $pNode;

975 + $elementsToScore[] = $node;

976 + } elseif (!$node->hasSingleChildBlockElement()) {

977 + $this->logger->debug(sprintf('[Get Nodes] Found DIV with a single child block element, converting to a P node. Node content is: \'%s\'', substr($node->nodeValue, 0, 128)));

978 + $node = NodeUtility::setNodeTag($node, 'p');

979 + $elementsToScore[] = $node;

980 + }

981 + }

982 +

983 + $node = NodeUtility::getNextNode($node);

984 + }

985 +

986 + return $elementsToScore;

987 + }

988 +

/**
 * Measures how much of the second text is covered by the first one.
 *
 * Both texts are lowercased and split into tokens with the 'tokenize' pattern. The tokens of $textB that
 * do not occur anywhere in $textA are collected; the result is 1 minus the ratio between the length of
 * those unmatched tokens and the length of all $textB tokens (both joined with single spaces).
 *
 * @param string $textA Reference text.
 * @param string $textB Text that is compared against the reference.
 *
 * @return int|float 1 = same text, 0 = completely different text (also returned when either text has no tokens)
 */
private function textSimilarity(string $textA, string $textB) {
    $pattern = NodeUtility::$regexps['tokenize'];

    // PREG_SPLIT_NO_EMPTY removes only empty pieces. A bare array_filter() would also discard a literal
    // "0" token because PHP treats that string as falsy.
    $tokensA = preg_split($pattern, readability_mb_strtolower($textA), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    $tokensB = preg_split($pattern, readability_mb_strtolower($textB), -1, PREG_SPLIT_NO_EMPTY) ?: [];
    if (!count($tokensA) || !count($tokensB)) {
        return 0;
    }

    // Strict comparison: with loose comparison numeric-looking tokens such as "1e3" and "1000" are equal.
    $uniqTokensB = array_filter($tokensB, function ($token) use ($tokensA) {
        return !in_array($token, $tokensA, true);
    });

    $slen = readability_mb_strlen(implode(' ', $tokensB));
    if ($slen == 0) {
        // Defensive guard against a division by zero below.
        return 0;
    }

    $distanceB = readability_mb_strlen(implode(' ', $uniqTokensB)) / $slen;

    return 1 - $distanceB;
}

1017 +

/**
 * Checks whether the node is the article byline and, if it is, stores its text as the author.
 *
 * A node qualifies when it carries author markup (rel="author", an itemprop containing "author", or a
 * class/id string matching the 'byline' pattern) AND its text passes isValidByline(). Nothing is checked
 * when bylines are disabled in the configuration or an author has already been set.
 *
 * @param DOMNode $node
 * @param string $matchString Class and id string of the node, used for the 'byline' pattern check.
 *
 * @return bool True when the node was recorded as the byline, false otherwise.
 */
private function checkByline($node, $matchString)
{
    if (!$this->configuration->getArticleByLine()) {
        return false;
    }

    /*
     * Check if the byline is already set
     */
    if ($this->getAuthor()) {
        return false;
    }

    $rel = $node->getAttribute('rel');
    $itemprop = $node->getAttribute("itemprop");

    $hasAuthorMarkup = $rel === 'author'
        || ($itemprop && strpos($itemprop, 'author') !== false)
        || preg_match(NodeUtility::$regexps['byline'], $matchString);

    if (!$hasAuthorMarkup) {
        return false;
    }

    // The validity check must apply to every kind of author markup. Written as `A || B || C && D`, PHP
    // evaluates `A || B || (C && D)`, which let rel/itemprop nodes bypass the check entirely.
    $text = $node->getTextContent(false);
    if (!$this->isValidByline($text)) {
        return false;
    }

    $this->logger->info(sprintf('[Metadata] Found article author: \'%s\'', $text));
    $this->setAuthor(trim($text));

    return true;
}

1051 +

/**
 * Tells whether a text is acceptable as a byline, judged only by its length.
 *
 * @param string $text
 *
 * @return bool True for a string whose trimmed length (per readability_mb_strlen()) is greater than 0
 *              and lower than 100; false for anything else, including non-string values.
 */
private function isValidByline($text)
{
    if (!is_string($text)) {
        return false;
    }

    $length = readability_mb_strlen(trim($text));

    return ($length > 0) && ($length < 100);
}

1069 +

/**
 * Converts some of the common HTML entities in a string to their corresponding characters.
 *
 * Two kinds of entities are handled: the named entities quot, amp, apos, lt and gt (looked up in
 * $this->htmlEscapeMap) and numeric references of one to four digits, decimal (&#NNNN;) or
 * hexadecimal (&#xHHHH;).
 *
 * @param string $str - a string to unescape.
 * @return string without HTML entity. Falsy input is returned unchanged.
 */
private function unescapeHtmlEntities($str) {
    if (!$str) {
        return $str;
    }

    $htmlEscapeMap = $this->htmlEscapeMap;
    $str = preg_replace_callback('/&(quot|amp|apos|lt|gt);/', function ($tag) use ($htmlEscapeMap) {
        return $htmlEscapeMap[$tag[1]];
    }, $str);

    // Only real hex digits are accepted: with [0-9a-z], input such as "&#xzz;" matched and was turned
    // into a control character because intval() parses just the valid prefix.
    $str = preg_replace_callback('/&#(?:x([0-9a-f]{1,4})|([0-9]{1,4}));/i', function ($matches) {
        $hex = $matches[1];
        if ($hex !== '') {
            // Hex alternative matched. Group 2 is a trailing unmatched group here and is therefore
            // absent from $matches, so it must not be read on this path.
            return readability_mb_chr(intval($hex, 16));
        }

        // Decimal alternative matched: group 1 is reported as an empty string and group 2 holds the digits.
        return readability_mb_chr(intval($matches[2], 10));
    }, $str);

    return $str;
}

1097 +

/**
 * Tells whether the node is an image, or contains exactly one image and nothing else,
 * whether as a direct child or further down a chain of only-children.
 *
 * @param DOMElement $node
 *
 * @return bool
 */
private function isSingleImage(DOMElement $node) {
    $current = $node;

    // Descend through only-children until an <img> is reached or the chain breaks.
    while ($current->tagName !== 'img') {
        $children = $current->children();

        // More (or fewer) than one child, or any non-blank text, means this is not a lone image.
        if ($children->length !== 1 || trim($current->textContent) !== '') {
            return false;
        }

        $current = $children->item(0);
    }

    return true;
}

1115 +

/**
 * Uses the images found inside <noscript> elements to replace the placeholder that precedes them.
 *
 * First, every <img> without a src/srcset/data-src/data-srcset attribute and without any attribute
 * value that looks like an image file name is removed. Then, for every <noscript> containing a single
 * image whose previous element sibling is also a single image, that previous element is replaced by
 * the first element child of a copy of the <noscript>; image-related attributes of the old image are
 * carried over to the new one. The <noscript> element itself is left in place by this method.
 * This improves the quality of the images we use on some sites (e.g. Medium).
 *
 * @param DOMDocument $dom
 */
private function unwrapNoscriptImages(DOMDocument $dom) {
    $imageFilePattern = '/\.(jpg|jpeg|png|webp)/i';
    $sourceAttributes = ['src', 'srcset', 'data-src', 'data-srcset'];

    // Step 1: drop images that carry nothing that could point to an image, so that such a placeholder
    // is not picked as the replacement target in step 2.
    foreach (iterator_to_array($dom->getElementsByTagName('img')) as $img) {
        $isPlaceholder = true;

        for ($index = 0; $index < $img->attributes->length; $index++) {
            $attribute = $img->attributes->item($index);
            if (in_array($attribute->name, $sourceAttributes, true) || preg_match($imageFilePattern, $attribute->value)) {
                $isPlaceholder = false;
                break;
            }
        }

        if ($isPlaceholder) {
            $img->parentNode->removeChild($img);
        }
    }

    // Step 2: find noscript elements and try to extract their image.
    foreach (iterator_to_array($dom->getElementsByTagName('noscript')) as $noscript) {
        // Work on a deep copy of the noscript and make sure it only contains an image.
        $tmp = $noscript->cloneNode(true);
        // NOTE(review): the return value is discarded, so this call looks like it has no effect - confirm.
        $dom->importNode($tmp);
        if (!$this->isSingleImage($tmp)) {
            continue;
        }

        // Only act when the previous element sibling exists and is itself a lone image.
        $prevElement = $noscript->previousElementSibling();
        if (!$prevElement || !$this->isSingleImage($prevElement)) {
            continue;
        }

        $prevImg = $prevElement;
        if ($prevImg->tagName !== 'img') {
            $prevImg = $prevElement->getElementsByTagName('img')->item(0);
        }

        // Keep the old image's attributes that might contain an image reference.
        $newImg = $tmp->getElementsByTagName('img')->item(0);
        for ($index = 0; $index < $prevImg->attributes->length; $index++) {
            $attribute = $prevImg->attributes->item($index);
            if ($attribute->value === '') {
                continue;
            }

            $mayHoldImage = $attribute->name === 'src'
                || $attribute->name === 'srcset'
                || preg_match($imageFilePattern, $attribute->value);
            if (!$mayHoldImage) {
                continue;
            }

            // Nothing to keep when the new image already has the very same value.
            if ($newImg->getAttribute($attribute->name) === $attribute->value) {
                continue;
            }

            // Do not overwrite an attribute of the new image; store the old value under a prefixed name.
            $targetName = $attribute->name;
            if ($newImg->hasAttribute($targetName)) {
                $targetName = 'data-old-' . $targetName;
            }

            $newImg->setAttribute($targetName, $attribute->value);
        }

        $noscript->parentNode->replaceChild($tmp->getFirstElementChild(), $prevElement);
    }
}

1195 +

/**
 * Removes every <script> and <noscript> element from the document.
 *
 * @param DOMDocument $dom
 */
private function removeScripts(DOMDocument $dom)
{
    $tagNames = ['script', 'noscript'];

    foreach ($tagNames as $tagName) {
        // Copy the node list into an array before removing anything from the document.
        $pending = iterator_to_array($dom->getElementsByTagName($tagName));

        while ($pending) {
            NodeUtility::removeNode(array_shift($pending));
        }
    }
}

1210 +

/**
 * Prepares the document for parsing: collapses chains of <br> elements into paragraphs and
 * converts <font> elements into <span> elements.
 *
 * @param DOMDocument $dom
 */
private function prepDocument(DOMDocument $dom)
{
    $this->logger->info('[PrepDocument] Preparing document for parsing...');

    foreach ($dom->shiftingAwareGetElementsByTagName('br') as $br) {
        // Whether 2 or more <br> elements have been found and replaced with a <p> block.
        $replaced = false;

        /*
         * If we find a <br> chain, remove the <br>s that follow the current one until we hit something
         * that is not a <br>. This leaves behind the first <br> in the chain (which is replaced with a
         * <p> below).
         */
        $next = NodeUtility::nextNode($br->nextSibling);
        while ($next && $next->nodeName === 'br') {
            $this->logger->debug('[PrepDocument] Removing chain of BR nodes...');

            $replaced = true;
            $brSibling = $next->nextSibling;
            $next->parentNode->removeChild($next);
            $next = NodeUtility::nextNode($brSibling);
        }

        if (!$replaced) {
            continue;
        }

        /*
         * A chain was removed: replace the remaining <br> with a <p> and move the following sibling
         * nodes into that <p> until another <br> chain or non-phrasing content is reached.
         */
        $p = $dom->createElement('p');
        $br->parentNode->replaceChild($p, $br);

        $next = $p->nextSibling;
        while ($next) {
            // If we've hit another <br><br>, we're done adding children to this <p>.
            if ($next->nodeName === 'br') {
                $nextElem = NodeUtility::nextNode($next->nextSibling);
                if ($nextElem && $nextElem->nodeName === 'br') {
                    break;
                }
            }

            if (!$next->isPhrasingContent()) {
                break;
            }

            $this->logger->debug('[PrepDocument] Replacing BR with a P node...');

            // Otherwise, make this node a child of the new <p>.
            $sibling = $next->nextSibling;
            $p->appendChild($next);
            $next = $sibling;
        }

        // Trim whitespace nodes from the end and then from the start of the new paragraph.
        while ($p->lastChild && $p->lastChild->isWhitespace()) {
            $p->removeChild($p->lastChild);
        }

        while ($p->firstChild && $p->firstChild->isWhitespace()) {
            $p->removeChild($p->firstChild);
        }

        // A <p> must not sit inside another <p>, so the enclosing paragraph becomes a <div>.
        if ($p->parentNode->tagName === 'p') {
            NodeUtility::setNodeTag($p->parentNode, 'div');
        }
    }

    // Replace font tags with span, walking the collected list from its last entry to its first.
    $fonts = $this->_getAllNodesWithTag($dom, ['font']);
    for ($index = count($fonts) - 1; $index >= 0; $index--) {
        $this->logger->debug('[PrepDocument] Converting font tag into a span tag.');
        NodeUtility::setNodeTag($fonts[$index], 'span');
    }
}

1298 +

/**
 * Assigns scores to each node, selects the best article container and builds the final article.
 *
 * Steps, in order:
 *  1. Every node with enough text adds a share of its score to up to five ancestors; ancestors seen
 *     for the first time become candidates.
 *  2. Candidate scores are scaled by link density and the best ones are kept, ordered by score.
 *  3. The best candidate is refined (common ancestor of close runners-up, better-scoring parent,
 *     only-child parent). If there is no candidate, or it is the body, a copy of the body's children
 *     wrapped in a new DIV is used instead.
 *  4. The top candidate and its qualifying siblings are copied into a new document, which is then
 *     passed through prepArticle().
 *
 * @param array $nodes Nodes to score.
 *
 * @return DOMDocument|bool The prepared article document, or false when no node was appended to it.
 */
private function rateNodes($nodes)
{
    $this->logger->info('[Rating] Rating nodes...');

    $candidates = [];

    /** @var DOMElement $node */
    foreach ($nodes as $node) {
        if (is_null($node->parentNode)) {
            continue;
        }

        // Read the text once: it is needed for the length check and for both score bonuses below.
        $innerText = $node->getTextContent(true);
        $innerTextLength = readability_mb_strlen($innerText);

        // Discard nodes with less than 25 characters
        if ($innerTextLength < 25) {
            continue;
        }

        $ancestors = $node->getNodeAncestors(5);

        // Exclude nodes with no ancestor
        if (count($ancestors) === 0) {
            continue;
        }

        // Start with a point for the paragraph itself as a base.
        $contentScore = 1;

        // Add one point per comma-separated segment of this paragraph (number of commas + 1).
        $contentScore += count(explode(',', $innerText));

        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        $contentScore += min(floor($innerTextLength / 100), 3);

        $this->logger->debug(sprintf('[Rating] Node score %s, content: \'%s\'', $contentScore, substr($node->nodeValue, 0, 128)));

        foreach ($ancestors as $level => $ancestor) {
            $this->logger->debug('[Rating] Found ancestor, initializing and adding it as a candidate...');
            if (!$ancestor->isInitialized()) {
                $ancestor->initializeNode($this->configuration->getWeightClasses());
                $candidates[] = $ancestor;
            }

            /*
             * Node score divider:
             *  - parent:             1 (no division)
             *  - grandparent:        2
             *  - great grandparent+: ancestor level * 3
             */
            if ($level === 0) {
                $scoreDivider = 1;
            } elseif ($level === 1) {
                $scoreDivider = 2;
            } else {
                $scoreDivider = $level * 3;
            }

            $currentScore = $ancestor->contentScore;
            $ancestor->contentScore = $currentScore + ($contentScore / $scoreDivider);

            $this->logger->debug(sprintf('[Rating] Ancestor score %s, value: \'%s\'', $ancestor->contentScore, substr($ancestor->nodeValue, 0, 128)));
        }
    }

    /*
     * After we've calculated scores, loop through all of the possible
     * candidate nodes we found and keep the ones with the highest score, best first.
     */
    $maxTopCandidates = $this->configuration->getMaxTopCandidates();
    $topCandidates = [];
    foreach ($candidates as $candidate) {
        /*
         * Scale the final candidates score based on link density. Good content
         * should have a relatively small link density (5% or less) and be mostly
         * unaffected by this operation.
         */
        $candidate->contentScore = $candidate->contentScore * (1 - $candidate->getLinkDensity());

        // Insert the candidate at its ranked position and drop the last entry if the list overflows.
        for ($i = 0; $i < $maxTopCandidates; $i++) {
            $aTopCandidate = isset($topCandidates[$i]) ? $topCandidates[$i] : null;

            if (!$aTopCandidate || $candidate->contentScore > $aTopCandidate->contentScore) {
                array_splice($topCandidates, $i, 0, [$candidate]);
                if (count($topCandidates) > $maxTopCandidates) {
                    array_pop($topCandidates);
                }
                break;
            }
        }
    }

    $topCandidate = isset($topCandidates[0]) ? $topCandidates[0] : null;
    $parentOfTopCandidate = null;

    /*
     * If we still have no top candidate, just use the body as a last resort.
     * We also have to copy the body node so it is something we can modify.
     */
    if ($topCandidate === null || $topCandidate->nodeName === 'body') {
        $this->logger->info('[Rating] No top candidate found or top candidate is the body tag. Moving all child nodes to a new DIV node.');

        // Copy all of the page's children into a DIV that lives in a fresh document.
        $topCandidate = new DOMDocument('1.0', 'utf-8');
        $topCandidate->encoding = 'UTF-8';
        $topCandidate->appendChild($topCandidate->createElement('div', ''));
        $kids = $this->dom->getElementsByTagName('body')->item(0)->childNodes;

        // importNode() copies, so the body's child list is not modified while it is walked by index.
        for ($i = 0; $i < $kids->length; $i++) {
            $import = $topCandidate->importNode($kids->item($i), true);
            $topCandidate->firstChild->appendChild($import);
        }

        // Candidate must be created using firstChild to grab the DOMElement instead of the DOMDocument.
        $topCandidate = $topCandidate->firstChild;
    } else {
        $this->logger->info(sprintf('[Rating] Found top candidate, score: %s', $topCandidate->contentScore));

        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite close to the current `topCandidate` node.
        $alternativeCandidateAncestors = [];
        for ($i = 1; $i < count($topCandidates); $i++) {
            // In some cases we may end up with a top candidate with zero content score. To avoid dividing by zero
            // we have to use max() and replace zero with a low value like 0.1
            if ($topCandidates[$i]->contentScore / max($topCandidate->contentScore, 0.1) >= 0.75) {
                array_push($alternativeCandidateAncestors, $topCandidates[$i]->getNodeAncestors(false));
            }
        }

        $MINIMUM_TOPCANDIDATES = 3;
        if (count($alternativeCandidateAncestors) >= $MINIMUM_TOPCANDIDATES) {
            $parentOfTopCandidate = $topCandidate->parentNode;

            // Check if we are actually dealing with a DOMNode and not a DOMDocument node or higher
            while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && $parentOfTopCandidate->nodeType === XML_ELEMENT_NODE) {
                $listsContainingThisAncestor = 0;
                for ($ancestorIndex = 0; $ancestorIndex < count($alternativeCandidateAncestors) && $listsContainingThisAncestor < $MINIMUM_TOPCANDIDATES; $ancestorIndex++) {
                    // Strict search: the question is whether this very node is in the list. A loose
                    // search compares objects by their property values, so two different nodes can match.
                    $listsContainingThisAncestor += (int)in_array($parentOfTopCandidate, $alternativeCandidateAncestors[$ancestorIndex], true);
                }
                if ($listsContainingThisAncestor >= $MINIMUM_TOPCANDIDATES) {
                    $topCandidate = $parentOfTopCandidate;
                    break;
                }
                $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
            }
        }

        /*
         * Because of our bonus system, parents of candidates might have scores
         * themselves. They get half of the node. There won't be nodes with higher
         * scores than our topCandidate, but if we see the score going *up* in the first
         * few steps up the tree, that's a decent sign that there might be more content
         * lurking in other places that we want to unify in. The sibling stuff
         * below does some of that - but only if we've looked high enough up the DOM
         * tree.
         */
        $parentOfTopCandidate = $topCandidate->parentNode;
        $lastScore = $topCandidate->contentScore;

        // The scores shouldn't get too low.
        $scoreThreshold = $lastScore / 3;

        /* @var DOMElement $parentOfTopCandidate */
        while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body') {
            $parentScore = $parentOfTopCandidate->contentScore;
            if ($parentScore < $scoreThreshold) {
                break;
            }

            if ($parentScore > $lastScore) {
                // Alright! We found a better parent to use.
                $topCandidate = $parentOfTopCandidate;
                $this->logger->info('[Rating] Found a better top candidate.');
                break;
            }
            $lastScore = $parentOfTopCandidate->contentScore;
            $parentOfTopCandidate = $parentOfTopCandidate->parentNode;
        }

        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        $parentOfTopCandidate = $topCandidate->parentNode;
        while ($parentOfTopCandidate && $parentOfTopCandidate->nodeName !== 'body' && count(NodeUtility::filterTextNodes($parentOfTopCandidate->childNodes)) === 1) {
            $topCandidate = $parentOfTopCandidate;
            $parentOfTopCandidate = $topCandidate->parentNode;
        }
    }

    /*
     * Now that we have the top candidate, look through its siblings for content
     * that might also be related. Things like preambles, content split by ads
     * that we removed, etc.
     */
    $this->logger->info('[Rating] Creating final article content document...');

    // Selected nodes are appended directly to this document.
    $articleContent = new DOMDocument('1.0', 'utf-8');

    $siblingScoreThreshold = max(10, $topCandidate->contentScore * 0.2);
    // Keep potential top candidate's parent node to try to get text direction of it later.
    $parentOfTopCandidate = $topCandidate->parentNode;
    $siblings = $parentOfTopCandidate->childNodes;

    $hasContent = false;

    $this->logger->info('[Rating] Adding top candidate siblings...');

    /* @var DOMElement $sibling */
    // Can't foreach here because down there we might change the tag name and that causes the foreach to skip items
    for ($i = 0; $i < $siblings->length; $i++) {
        $sibling = $siblings->item($i);
        $append = false;

        if ($sibling === $topCandidate) {
            $this->logger->debug('[Rating] Sibling is equal to the top candidate, adding to the final article...');

            $append = true;
        } else {
            $contentBonus = 0;

            // Give a bonus if sibling nodes and top candidates have the exact same classname
            if ($sibling->getAttribute('class') === $topCandidate->getAttribute('class') && $topCandidate->getAttribute('class') !== '') {
                $contentBonus += $topCandidate->contentScore * 0.2;
            }

            if ($sibling->contentScore + $contentBonus >= $siblingScoreThreshold) {
                $append = true;
            } elseif ($sibling->nodeName === 'p') {
                $linkDensity = $sibling->getLinkDensity();
                $nodeContent = $sibling->getTextContent(true);

                // NOTE(review): `=== 0` only matches an integer zero; if getLinkDensity() can return
                // 0.0 the second branch never fires - confirm against its implementation.
                if (readability_mb_strlen($nodeContent) > 80 && $linkDensity < 0.25) {
                    $append = true;
                } elseif ($nodeContent && readability_mb_strlen($nodeContent) < 80 && $linkDensity === 0 && preg_match('/\.( |$)/', $nodeContent)) {
                    $append = true;
                }
            }
        }

        if ($append) {
            $this->logger->debug(sprintf('[Rating] Appending sibling to final article, content is: \'%s\'', substr($sibling->nodeValue, 0, 128)));

            $hasContent = true;

            if (!in_array(strtolower($sibling->nodeName), $this->alterToDIVExceptions)) {
                /*
                 * We have a node that isn't a common block level element, like a form or td tag.
                 * Turn it into a div so it doesn't get filtered out later by accident.
                 */
                $sibling = NodeUtility::setNodeTag($sibling, 'div');
            }

            // The sibling is copied (importNode), not moved, into the article document.
            $import = $articleContent->importNode($sibling, true);
            $articleContent->appendChild($import);
        }
    }

    $articleContent = $this->prepArticle($articleContent);

    if (!$hasContent) {
        return false;
    }

    // Find out text direction from ancestors of final top candidate.
    $ancestors = array_merge([$parentOfTopCandidate, $topCandidate], $parentOfTopCandidate->getNodeAncestors());
    foreach ($ancestors as $ancestor) {
        $articleDir = $ancestor->getAttribute('dir');
        if ($articleDir) {
            $this->setDirection($articleDir);
            $this->logger->debug(sprintf('[Rating] Found article direction: %s', $articleDir));
            break;
        }
    }

    return $articleContent;
}

1592 +

/**
 * Cleans up the final article: strips styles and unwanted elements, fixes lazy images and
 * simplifies leftover markup (H1 headers, stray BR nodes, single-cell tables).
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument The same document instance, after clean-up.
 */
public function prepArticle(DOMDocument $article)
{
    $this->logger->info('[PrepArticle] Preparing final article...');

    $this->_cleanStyles($article);
    $this->_clean($article, 'style');

    // Check for data tables before we continue, to avoid removing items in
    // those tables, which will often be isolated even though they're
    // visually linked to other content-ful elements (text, images, etc.).
    $this->_markDataTables($article);

    $this->_fixLazyImages($article);

    // Clean out junk from the article content
    foreach (['form', 'fieldset'] as $tagName) {
        $this->_cleanConditionally($article, $tagName);
    }
    foreach (['object', 'embed', 'footer', 'link', 'aside'] as $tagName) {
        $this->_clean($article, $tagName);
    }

    // Clean out elements that have "share" in their id/class combinations from the final top candidates.
    // The matcher is run on the descendants of each top-level node, which means we don't remove the
    // top candidates themselves even if they have "share".
    $shareElementThreshold = $this->configuration->getCharThreshold();
    $isShortShareElement = function ($node, $matchString) use ($shareElementThreshold) {
        return (preg_match(NodeUtility::$regexps['shareElements'], $matchString) && readability_mb_strlen($node->textContent) < $shareElementThreshold);
    };

    foreach ($article->childNodes as $child) {
        $this->_cleanMatchedNodes($child, $isShortShareElement);
    }

    foreach (['iframe', 'input', 'textarea', 'select', 'button'] as $tagName) {
        $this->_clean($article, $tagName);
    }
    $this->_cleanHeaders($article);

    // Do these last as the previous stuff may have removed junk
    // that will affect these
    foreach (['table', 'ul', 'div'] as $tagName) {
        $this->_cleanConditionally($article, $tagName);
    }

    // replace H1 with H2 as H1 should be only title that is displayed separately
    $headings = iterator_to_array($article->getElementsByTagName('h1'));
    foreach ($headings as $heading) {
        NodeUtility::setNodeTag($heading, 'h2');
    }

    $this->_cleanExtraParagraphs($article);

    // Drop every BR whose next node (as returned by NodeUtility::nextNode()) is a paragraph.
    $lineBreaks = iterator_to_array($article->getElementsByTagName('br'));
    foreach ($lineBreaks as $br) {
        $next = NodeUtility::nextNode($br->nextSibling);
        if ($next && $next->nodeName === 'p') {
            $this->logger->debug('[PrepArticle] Removing br node next to a p node.');
            $br->parentNode->removeChild($br);
        }
    }

    // Remove single-cell tables
    foreach ($article->shiftingAwareGetElementsByTagName('table') as $table) {
        /** @var DOMNode $table */
        $tbody = $table->hasSingleTagInsideElement('tbody') ? $table->getFirstElementChild() : $table;
        if (!$tbody->hasSingleTagInsideElement('tr')) {
            continue;
        }

        $row = $tbody->getFirstElementChild();
        if (!$row->hasSingleTagInsideElement('td')) {
            continue;
        }

        $cell = $row->getFirstElementChild();

        // The cell becomes a P when all of its children are phrasing content, a DIV otherwise.
        $onlyPhrasingContent = true;
        foreach (iterator_to_array($cell->childNodes) as $cellChild) {
            $onlyPhrasingContent = $cellChild->isPhrasingContent() && $onlyPhrasingContent;
        }

        $cell = NodeUtility::setNodeTag($cell, $onlyPhrasingContent ? 'p' : 'div');
        $table->parentNode->replaceChild($cell, $table);
    }

    return $article;
}

1704 +

/**
 * Look for 'data' (as opposed to 'layout') tables, for which we use
 * similar checks as
 * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920.
 *
 * Every table in the document is flagged through setReadabilityDataTable(). The checks run in order
 * and the first one that applies decides:
 *  - role="presentation" or datatable="0"                    => layout table
 *  - a summary attribute or a non-empty caption              => data table
 *  - a col, colgroup, tfoot, thead or th descendant          => data table
 *  - a nested table                                          => layout table
 *  - 10 or more rows, or more than 4 columns                 => data table
 *  - otherwise: data table only when rows * columns exceeds 10
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _markDataTables(DOMDocument $article)
{
    $tables = $article->getElementsByTagName('table');
    foreach ($tables as $table) {
        /** @var DOMElement $table */
        $role = $table->getAttribute('role');
        if ($role === 'presentation') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        // Strict comparison: only the literal value "0" counts. With `==` both sides are compared as
        // numbers when they look numeric, so values such as "00" or "0.0" would match as well.
        $datatable = $table->getAttribute('datatable');
        if ($datatable === '0') {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $summary = $table->getAttribute('summary');
        if ($summary) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        $caption = $table->getElementsByTagName('caption');
        if ($caption->length > 0 && $caption->item(0)->childNodes->length > 0) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // If the table has a descendant with any of these tags, consider a data table:
        foreach (['col', 'colgroup', 'tfoot', 'thead', 'th'] as $dataTableDescendants) {
            if ($table->getElementsByTagName($dataTableDescendants)->length > 0) {
                $table->setReadabilityDataTable(true);
                // Move on to the next table, not just the next tag name.
                continue 2;
            }
        }

        // Nested tables indicate a layout table:
        if ($table->getElementsByTagName('table')->length > 0) {
            $table->setReadabilityDataTable(false);
            continue;
        }

        $sizeInfo = $table->getRowAndColumnCount();
        if ($sizeInfo['rows'] >= 10 || $sizeInfo['columns'] > 4) {
            $table->setReadabilityDataTable(true);
            continue;
        }

        // Now just go by size entirely:
        $table->setReadabilityDataTable($sizeInfo['rows'] * $sizeInfo['columns'] > 10);
    }
}

1764 +

/**
 * Turn lazily loaded images and figures (e.g. ones that keep the real URL in data-src)
 * into images that can be loaded without JavaScript.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _fixLazyImages(DOMDocument $article)
{
    foreach ($this->_getAllNodesWithTag($article, ['img', 'picture', 'figure']) as $elem) {
        // In some sites (e.g. Kotaku), they put 1px square image as base64 data uri in the src attribute.
        // Such a short data uri is most likely a placeholder, so it may be dropped.
        $src = $elem->getAttribute('src');
        if ($src && preg_match(NodeUtility::$regexps['b64DataUrl'], $src, $parts)) {
            // SVG is exempt, because SVG can have a meaningful image in under 133 bytes.
            if ($parts[1] === 'image/svg+xml') {
                continue;
            }

            // Only drop the src when some other attribute references an image file;
            // otherwise this src is the only image source and must be kept.
            $srcCouldBeRemoved = false;
            foreach ($elem->attributes as $attribute) {
                if ($attribute->name === 'src') {
                    continue;
                }

                if (preg_match('/\.(jpg|jpeg|png|webp)/i', $attribute->value)) {
                    $srcCouldBeRemoved = true;
                    break;
                }
            }

            // Here we assume if image is less than 100 bytes (or 133B after encoded to base64)
            // it will be too small, therefore it might be placeholder image.
            if ($srcCouldBeRemoved) {
                $payloadOffset = stripos($src, 'base64') + 7;
                $payloadLength = strlen($src) - $payloadOffset;
                if ($payloadLength < 133) {
                    $elem->removeAttribute('src');
                }
            }
        }

        // Leave the element alone when it already has a src or srcset and its class
        // attribute shows no sign of 'lazy' loading.
        if (
            ($elem->getAttribute('src') || $elem->getAttribute('srcset'))
            && readability_mb_stripos($elem->getAttribute('class'), 'lazy') === false
        ) {
            continue;
        }

        // Index-based loop on purpose: setAttribute() below can grow the attribute map while iterating.
        for ($position = 0; $position < $elem->attributes->length; $position++) {
            $attribute = $elem->attributes->item($position);
            if (in_array($attribute->name, ['src', 'srcset', 'alt'], true)) {
                continue;
            }

            if (preg_match('/\.(jpg|jpeg|png|webp)\s+\d/', $attribute->value)) {
                // Image URL followed by a descriptor: looks like a srcset value.
                $copyTo = 'srcset';
            } elseif (preg_match('/^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/', $attribute->value)) {
                // A single image URL: looks like a src value.
                $copyTo = 'src';
            } else {
                continue;
            }

            if ($elem->tagName === 'img' || $elem->tagName === 'picture') {
                // if this is an img or picture, set the attribute directly
                $elem->setAttribute($copyTo, $attribute->value);
            } elseif ($elem->tagName === 'figure' && empty($this->_getAllNodesWithTag($elem, ['img', 'picture']))) {
                // if the item is a <figure> that does not contain an image or picture,
                // create one and place it inside the figure
                // see the nytimes-3 testcase for an example
                $img = $article->createElement('img');
                $img->setAttribute($copyTo, $attribute->value);
                $elem->appendChild($img);
            }
        }
    }
}

1842 +

/**
 * Strip the style attribute and deprecated presentational attributes from a node
 * and, recursively, from everything below it. <svg> subtrees are left untouched.
 *
 * @param $node DOMDocument|DOMNode
 **/
public function _cleanStyles($node)
{
    $tagName = property_exists($node, 'tagName') ? $node->tagName : null;

    // Skip <svg> and all of its descendants.
    if ($tagName === 'svg') {
        return;
    }

    // Do not bother if there's no method to remove an attribute
    if (method_exists($node, 'removeAttribute')) {
        // `style` plus the deprecated presentational attributes
        $presentationalAttributes = [
            'align', 'background', 'bgcolor', 'border', 'cellpadding', 'cellspacing',
            'frame', 'hspace', 'rules', 'style', 'valign', 'vspace',
        ];
        foreach ($presentationalAttributes as $attributeName) {
            $node->removeAttribute($attributeName);
        }

        // width/height are only removed from these elements.
        $sizedElements = ['table', 'th', 'td', 'hr', 'pre'];
        if ($tagName !== null && in_array($tagName, $sizedElements)) {
            $node->removeAttribute('width');
            $node->removeAttribute('height');
        }
    }

    for ($child = $node->firstChild; $child !== null; $child = $child->nextSibling) {
        $this->_cleanStyles($child);
    }
}

1875 +

/**
 * Walk the nodes following $node and remove every one the filter accepts.
 *
 * The walk stops at the node returned by NodeUtility::getNextNode($node, true).
 * The filter receives the node and the string "<class> <id>" built from its attributes.
 *
 * @param $node DOMElement Node to clean
 * @param $filter callable Function determines whether a node should be removed
 *
 * @return void
 **/
public function _cleanMatchedNodes($node, callable $filter)
{
    $stopAt = NodeUtility::getNextNode($node, true);
    $current = NodeUtility::getNextNode($node);

    while ($current && $current !== $stopAt) {
        $matchString = sprintf('%s %s', $current->getAttribute('class'), $current->getAttribute('id'));

        if (!$filter($current, $matchString)) {
            // Not a match: just advance.
            $current = NodeUtility::getNextNode($current);
            continue;
        }

        $this->logger->debug(sprintf('Removing matched node, node class was: \'%s\', id: \'%s\'', $current->getAttribute('class'), $current->getAttribute('id')));
        // Removal also yields the node to continue from.
        $current = NodeUtility::removeAndGetNext($current);
    }
}

1897 +

/**
 * Remove <p> elements that contain neither media (img, embed, object, iframe)
 * nor any non-whitespace text.
 *
 * @param DOMDocument $article
 *
 * @return void
 */
public function _cleanExtraParagraphs(DOMDocument $article)
{
    $paragraphs = $this->_getAllNodesWithTag($article, ['p']);

    // Walk the list from the last paragraph to the first.
    for ($i = count($paragraphs) - 1; $i >= 0; $i--) {
        $paragraph = $paragraphs[$i];

        $imgCount = $paragraph->getElementsByTagName('img')->length;
        $embedCount = $paragraph->getElementsByTagName('embed')->length;
        $objectCount = $paragraph->getElementsByTagName('object')->length;
        // At this point, nasty iframes have been removed, only remain embedded video ones.
        $iframeCount = $paragraph->getElementsByTagName('iframe')->length;
        $totalCount = $imgCount + $embedCount + $objectCount + $iframeCount;

        if ($totalCount !== 0) {
            continue;
        }

        // Compare against the empty string explicitly. A plain truthiness test would treat a
        // paragraph whose only text is "0" as empty, and would also delete the paragraph when
        // preg_replace() fails and returns null.
        $visibleText = preg_replace(NodeUtility::$regexps['onlyWhitespace'], '', $paragraph->textContent);
        if ($visibleText === '') {
            $this->logger->debug(sprintf('[PrepArticle] Removing extra paragraph. Text content was: \'%s\'', substr($paragraph->textContent, 0, 128)));
            $paragraph->parentNode->removeChild($paragraph);
        }
    }
}

1924 +

/**
 * Ratio between the text length found inside descendants with the given tags
 * and the text length of the element itself.
 *
 * @param mixed $e    Element to measure; must offer getTextContent() and getElementsByTagName().
 * @param array $tags Tag names whose text is counted.
 *
 * @return int|float 0 when the element has no text.
 */
private function getTextDensity($e, array $tags) {
    $ownLength = readability_mb_strlen($e->getTextContent(true));
    if ($ownLength == 0) {
        return 0;
    }

    $taggedLengths = array_map(function ($descendant) {
        return readability_mb_strlen($descendant->getTextContent(true));
    }, $this->_getAllNodesWithTag($e, $tags));

    return array_sum($taggedLengths) / $ownLength;
}

1937 +

/**
 * Remove elements of the given tag that look like non-content, judged by class weight,
 * comma count, element counts, link density and embeds.
 *
 * Does nothing when the configuration's getCleanConditionally() is falsy.
 *
 * @param DOMDocument $article
 * @param string $tag Tag to clean conditionally
 *
 * @return void
 */
public function _cleanConditionally(DOMDocument $article, $tag)
{
    if (!$this->configuration->getCleanConditionally()) {
        return;
    }

    $candidates = $this->_getAllNodesWithTag($article, [$tag]);

    // Only the tag name decides whether we are dealing with a list. The extra text-ratio
    // heuristic of the JS version did not seem to work as expected here, see
    // https://github.com/mozilla/readability/commit/3c833899866ffb1f9130767110197fd6f5c08d4c
    $isList = in_array($tag, ['ul', 'ol']);

    /*
     * Traverse backwards so we can remove nodes at the same time
     * without affecting the traversal.
     */
    for ($index = count($candidates) - 1; $index >= 0; $index--) {
        /** @var $node DOMElement */
        $node = $candidates[$index];

        // First check if this node IS data table, in which case don't remove it.
        if ($tag === 'table' && $node->isReadabilityDataTable()) {
            continue;
        }

        // Next check if we're inside a data table, in which case don't remove it as well.
        $insideDataTable = $node->hasAncestorTag('table', -1, function ($node) {
            return $node->isReadabilityDataTable();
        });
        if ($insideDataTable) {
            continue;
        }

        if ($node->hasAncestorTag('code')) {
            continue;
        }

        $weight = $this->configuration->getWeightClasses() ? $node->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\' with 0 or less weight', $tag));

            NodeUtility::removeNode($node);
            continue;
        }

        // Nodes with ten or more commas are kept without further checks.
        if (substr_count($node->getTextContent(false), ',') >= 10) {
            continue;
        }

        /*
         * If there are not very many commas, and the number of
         * non-paragraph elements is more than paragraphs or other
         * ominous signs, remove the element.
         */
        $paragraphCount = $node->getElementsByTagName('p')->length;
        $imageCount = $node->getElementsByTagName('img')->length;
        // NOTE(review): the -100 offset is kept as-is from the original code; its rationale is not visible here.
        $listItemScore = $node->getElementsByTagName('li')->length - 100;
        $inputCount = $node->getElementsByTagName('input')->length;
        $headingDensity = $this->getTextDensity($node, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']);

        $embedCount = 0;
        foreach ($this->_getAllNodesWithTag($node, ['object', 'embed', 'iframe']) as $embedNode) {
            foreach ($embedNode->attributes as $embedAttribute) {
                if (preg_match(NodeUtility::$regexps['videos'], $embedAttribute->nodeValue)) {
                    // A video embed: keep the whole node and move on to the next candidate.
                    continue 3;
                }
            }

            // For embed with <object> tag, check inner HTML as well.
            if ($embedNode->tagName === "object" && preg_match(NodeUtility::$regexps['videos'], $embedNode->C14N())) {
                continue 2;
            }

            $embedCount++;
        }

        $linkDensity = $node->getLinkDensity();
        $contentLength = readability_mb_strlen($node->getTextContent(true));

        // The guards before each division / ancestor lookup rely on short-circuit evaluation.
        $haveToRemove =
            ($imageCount > 1 && $paragraphCount / $imageCount < 0.5 && !$node->hasAncestorTag('figure')) ||
            (!$isList && $listItemScore > $paragraphCount) ||
            ($inputCount > floor($paragraphCount / 3)) ||
            (!$isList && $headingDensity < 0.9 && $contentLength < 25 && ($imageCount === 0 || $imageCount > 2) && !$node->hasAncestorTag('figure')) ||
            (!$isList && $weight < 25 && $linkDensity > 0.2) ||
            ($weight >= 25 && $linkDensity > 0.5) ||
            (($embedCount === 1 && $contentLength < 75) || $embedCount > 1);

        if ($haveToRemove) {
            $this->logger->debug(sprintf('[PrepArticle] Removing tag \'%s\'.', $tag));

            NodeUtility::removeNode($node);
        }
    }
}

2060 +

/**
 * Collect all descendants of $node that match any of the given tag names.
 *
 * The result is grouped by tag name in the order of $tagNames: every match for the first
 * tag, then every match for the second, and so on.
 *
 * @param mixed $node     Anything that offers getElementsByTagName().
 * @param array $tagNames Tag names to look for.
 *
 * @return array
 */
public function _getAllNodesWithTag($node, array $tagNames) {
    $collected = [];

    foreach ($tagNames as $tagName) {
        // Copy the matches into a plain array.
        foreach ($node->getElementsByTagName($tagName) as $match) {
            array_push($collected, $match);
        }
    }

    return $collected;
}

2071 +

/**
 * Remove every element of type "tag" from the article.
 * (Embeds that match the videos pattern, e.g. youtube/vimeo, are kept. People love movies.)
 *
 * @param $article DOMDocument
 * @param $tag string tag to clean
 *
 * @return void
 **/
public function _clean(DOMDocument $article, $tag)
{
    $isEmbed = in_array($tag, ['object', 'embed', 'iframe']);

    // Process the matches from last to first.
    foreach (array_reverse($this->_getAllNodesWithTag($article, [$tag])) as $item) {
        // Allow youtube and vimeo videos through as people usually want to see those.
        if ($isEmbed) {
            $values = [];
            foreach ($item->attributes as $attribute) {
                $values[] = $attribute->nodeValue;
            }
            $joinedValues = implode('|', $values);

            // First the element's own attribute values are checked; for <object> the
            // serialised inner markup is checked as well.
            if (
                preg_match(NodeUtility::$regexps['videos'], $joinedValues)
                || ($item->tagName === 'object' && preg_match(NodeUtility::$regexps['videos'], $item->C14N()))
            ) {
                continue;
            }
        }

        $this->logger->debug(sprintf('[PrepArticle] Removing node \'%s\'.', $item->tagName));

        NodeUtility::removeNode($item);
    }
}

2113 +

/**
 * Remove <h1> and <h2> elements whose class weight is negative.
 *
 * When the configuration's getWeightClasses() is falsy the weight is always 0,
 * so nothing is removed.
 *
 * @param DOMDocument $article
 *
 * @return void
 **/
public function _cleanHeaders(DOMDocument $article)
{
    /** @var $header DOMElement */
    foreach ($this->_getAllNodesWithTag($article, ['h1', 'h2']) as $header) {
        $weight = $this->configuration->getWeightClasses() ? $header->getClassWeight() : 0;

        if ($weight < 0) {
            $this->logger->debug(sprintf('[PrepArticle] Removing H node with 0 or less weight. Content was: \'%s\'', substr($header->nodeValue, 0, 128)));

            NodeUtility::removeNode($header);
        }
    }
}

2139 +

/**
 * Check if this node is an H1 or H2 element whose content is mostly
 * the same as the article title.
 *
 * @param DOMNode $node the node to check.
 *
 * @return boolean true when the similarity between title and heading text exceeds 0.75;
 *                 false for other node names or when no title is set.
 */
private function headerDuplicatesTitle($node) {
    if (!in_array($node->nodeName, ['h1', 'h2'], true)) {
        return false;
    }
    if (!isset($this->title)) {
        return false;
    }

    $heading = $node->getTextContent(false);
    // The heading is wrapped in a balanced pair of quotes (the old format string only had a trailing one).
    $this->logger->debug(sprintf('Evaluating similarity of header: "%s"', $heading));

    return $this->textSimilarity($this->title, $heading) > 0.75;
}

2158 +

/**
 * Drop the class attribute from the given node and recurse into its subtree.
 *
 * Readability.js has a special filter to avoid cleaning the classes that the algorithm adds. We don't add classes
 * here so no need to filter those.
 *
 * @param DOMDocument|DOMNode $node
 *
 * @return void
 **/
public function _cleanClasses($node)
{
    if ($node->getAttribute('class') !== '') {
        $node->removeAttribute('class');
    }

    // Start at the first element child, then follow the sibling chain.
    $child = $node->getFirstElementChild();
    while ($child !== null) {
        $this->_cleanClasses($child);
        $child = $child->nextSibling;
    }
}

2180 +

/**
 * Final clean-up pass over the extracted article.
 *
 * When getFixRelativeURLs() is enabled, `javascript:` links are replaced by their content
 * and all other link/media URLs are passed through toAbsoluteURI(). Afterwards nested
 * elements are simplified and, unless getKeepClasses() is enabled, class attributes are removed.
 *
 * @param DOMDocument $article
 *
 * @return DOMDocument The same document instance that was passed in.
 */
public function postProcessContent(DOMDocument $article)
{
    $this->logger->info('[PostProcess] PostProcessing content...');

    // Readability cannot open relative uris so we convert them to absolute uris.
    if ($this->configuration->getFixRelativeURLs()) {
        // Work on a snapshot, because links may be replaced while iterating.
        foreach (iterator_to_array($article->getElementsByTagName('a')) as $link) {
            /** @var DOMElement $link */
            $href = $link->getAttribute('href');
            if (!$href) {
                continue;
            }

            // Remove links with javascript: URIs, since they won't work after scripts have been
            // removed from the page. URI schemes are case-insensitive and browsers ignore leading
            // whitespace, so `JavaScript:` and ` javascript:` have to be caught as well.
            if (stripos(ltrim($href), 'javascript:') === 0) {
                $this->logger->debug(sprintf('[PostProcess] Removing \'javascript:\' link. Content is: \'%s\'', substr($link->textContent, 0, 128)));

                if ($link->childNodes->length === 1 && $link->childNodes->item(0)->nodeType === XML_TEXT_NODE) {
                    // if the link only contains simple text content, it can be converted to a text node
                    $text = $article->createTextNode($link->textContent);
                    $link->parentNode->replaceChild($text, $link);
                } else {
                    // if the link has multiple children, they should all be preserved
                    $container = $article->createElement('span');
                    while ($link->firstChild) {
                        $container->appendChild($link->firstChild);
                    }
                    $link->parentNode->replaceChild($container, $link);
                }
            } else {
                $this->logger->debug(sprintf('[PostProcess] Converting link to absolute URI: \'%s\'', substr($href, 0, 128)));

                $link->setAttribute('href', $this->toAbsoluteURI($href));
            }
        }

        $medias = $this->_getAllNodesWithTag($article, [
            'img', 'picture', 'figure', 'video', 'audio', 'source'
        ]);

        array_walk($medias, function ($media) {
            $src = $media->getAttribute('src');
            $poster = $media->getAttribute('poster');
            $srcset = $media->getAttribute('srcset');

            if ($src) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($src, 0, 128)));

                $media->setAttribute('src', $this->toAbsoluteURI($src));
            }

            if ($poster) {
                $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($poster, 0, 128)));

                $media->setAttribute('poster', $this->toAbsoluteURI($poster));
            }

            if ($srcset) {
                // Only the URL part of each srcset candidate is rewritten; the remaining
                // captured parts are appended unchanged.
                $newSrcset = preg_replace_callback(NodeUtility::$regexps['srcsetUrl'], function ($matches) {
                    $this->logger->debug(sprintf('[PostProcess] Converting image URL to absolute URI: \'%s\'', substr($matches[1], 0, 128)));

                    return $this->toAbsoluteURI($matches[1]) . $matches[2] . $matches[3];
                }, $srcset);

                $media->setAttribute('srcset', $newSrcset);
            }
        });
    }

    $this->simplifyNestedElements($article);

    if (!$this->configuration->getKeepClasses()) {
        $this->_cleanClasses($article);
    }

    return $article;
}

2262 +

/**
 * Return the first node of the list for which the test function returns a truthy value.
 *
 * Nodes after the first match are not tested.
 *
 * @param array    $nodeList The nodes to search.
 * @param callable $fn       The test function; receives one node.
 *
 * @return DOMNode|null null when no node passes the test.
 */
private function findNode(array $nodeList, callable $fn)
{
    foreach ($nodeList as $candidate) {
        if (!$fn($candidate)) {
            continue;
        }

        return $candidate;
    }

    return null;
}

2280 +

/**
 * Render the article as an <h1> holding the title, followed by the content markup.
 *
 * @return string
 */
public function __toString()
{
    $title = $this->getTitle();
    $content = $this->getContent();

    return sprintf('<h1>%s</h1>%s', $title, $content);
}

2288 +

/**
 * Title of the article.
 *
 * @return string|null
 */
public function getTitle()
{
    return $this->title;
}

2296 +

/**
 * Store the title of the article.
 *
 * @param string $title
 */
protected function setTitle($title)
{
    $this->title = $title;
}

2304 +

/**
 * Serialise the parsed article to an HTML string.
 *
 * @return string|null null when no content document has been set.
 */
public function getContent()
{
    if (!($this->content instanceof DOMDocument)) {
        return null;
    }

    $serializer = new HTML5(['disable_html_ns' => true]);

    // by using childNodes below we make sure HTML5PHP's serialiser
    // doesn't output the <!DOCTYPE html> string at the start.
    return $serializer->saveHTML($this->content->childNodes);
}

2319 +

/**
 * The content document itself, as opposed to its serialised form.
 *
 * @return DOMDocument|null
 */
public function getDOMDocument()
{
    return $this->content;
}

2327 +

/**
 * Store the document holding the parsed article.
 *
 * @param DOMDocument $content
 */
protected function setContent(DOMDocument $content)
{
    $this->content = $content;
}

2335 +

/**
 * Excerpt of the article.
 *
 * @return null|string
 */
public function getExcerpt()
{
    return $this->excerpt;
}

2343 +

/**
 * Store the excerpt of the article.
 *
 * @param null|string $excerpt
 */
public function setExcerpt($excerpt)
{
    $this->excerpt = $excerpt;
}

2351 +

/**
 * Main image of the article.
 *
 * @return string|null
 */
public function getImage()
{
    return $this->image;
}

2359 +

/**
 * Store the main image of the article.
 *
 * @param string $image
 */
protected function setImage($image)
{
    $this->image = $image;
}

2367 +

/**
 * Author of the article.
 *
 * @return string|null
 */
public function getAuthor()
{
    return $this->author;
}

2375 +

/**
 * Store the author of the article.
 *
 * @param string $author
 */
protected function setAuthor($author)
{
    $this->author = $author;
}

2383 +

/**
 * Website name.
 *
 * @return string|null
 */
public function getSiteName()
{
    return $this->siteName;
}

2391 +

/**
 * Store the website name.
 *
 * @param string $siteName
 */
protected function setSiteName($siteName)
{
    $this->siteName = $siteName;
}

2399 +

/**
 * Direction of the text.
 *
 * @return null|string
 */
public function getDirection()
{
    return $this->direction;
}

2407 +

/**
 * Store the direction of the text.
 *
 * @param null|string $direction
 */
public function setDirection($direction)
{
    $this->direction = $direction;
}

2415 + }

2416 +

Diff: STRATO-apps/wordpress_03/app/wp-content/plugins/aimogen-pro/res/readability/Readability.php