diff options
author | Aleteoryx <alyx@aleteoryx.me> | 2024-11-14 16:23:52 -0500 |
---|---|---|
committer | Aleteoryx <alyx@aleteoryx.me> | 2024-11-14 16:23:52 -0500 |
commit | 1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f (patch) | |
tree | ef229301b1add213db87a20660fb1ba7a1cf15cc | |
parent | 4fb02a34f4f6e3dd4a9c55a604fffc65da368c14 (diff) | |
download | rss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.tar.gz rss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.tar.bz2 rss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.zip |
parse out html
-rw-r--r-- | rss.php | 89 |
1 files changed, 75 insertions, 14 deletions
@@ -53,7 +53,7 @@ $config["link_target"] = "_top";
 
 /* --- CODE - DO NOT TOUCH --- */
 
-function load_rss(string $uri, string $linkrel = "alternate"): array {
+function load_rss(string $uri, string $linkrel = "alternate", ?bool $allow_html = NULL): array {
   global $config;
 
   $xml = file_get_contents($uri);
@@ -78,11 +78,24 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
     foreach ($doc->getElementsByTagName("item") as $node) {
       $data["title"] = $node->getElementsByTagName("title")
         ?->item(0)?->textContent;
+      $data["title"] ??= "[[[No Title]]]";
+      $data["title"] = htmlentities(html_entity_decode($data["title"]));
+      // links get the same decode-then-escape treatment as titles, so re-escaping is harmless
       $data["link"] ??= $node->getElementsByTagName("link")
         ?->item(0)?->textContent;
+      $data["link"] = htmlentities(html_entity_decode($data["link"] ?? ""));
+
+      // RSS descriptions are assumed to be HTML unless $allow_html is explicitly FALSE
+      $data["content"] = $node->getElementsByTagName("description")
+        ?->item(0)?->textContent ?? "";
+      if ($allow_html !== FALSE) {
+        $data["content"] = strip_html($data["content"]);
+      } else {
+        $data["content"] = htmlentities(html_entity_decode($data["content"]));
+      }
 
       $data["date"] = new DateTime($node->getElementsByTagName("pubDate")
-        ->item(0)->textContent);
+        ?->item(0)?->textContent ?? '@0');
       $data["date"]->setTimezone($config["timezone"]);
 
       $parsed[] = $data;
@@ -92,12 +105,26 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
     foreach ($doc->getElementsByTagName("entry") as $node) {
       $data["title"] = $node->getElementsByTagName("title")
         ?->item(0)?->textContent;
+      $data["title"] ??= "[[[No Title]]]";
+      $data["title"] = htmlentities(html_entity_decode($data["title"]));
+
+      $data["content"] = $node->getElementsByTagName("content")
+        ?->item(0)?->textContent ?? "";
+      // Atom declares its content type: only type="html" is sanitised as markup, the rest is escaped
+      if ($node->getElementsByTagName("content")
+            ?->item(0)
+            ?->getAttribute("type") === "html" && $allow_html !== FALSE) {
+        $data["content"] = strip_html($data["content"]);
+      } else {
+        $data["content"] = htmlentities(html_entity_decode($data["content"]));
+      }
+
       $data["links"] = [];
       foreach ($node->getElementsByTagName("link")->getIterator() as $link) {
-        $date["links"][] = ["rel" => $link->getAttribute("rel"),
-                            "href" => $link->getAttribute("href")];
+        $data["links"][] = ["rel" => htmlentities($link->getAttribute("rel")),
+                            "href" => htmlentities($link->getAttribute("href"))];
         if ($link->getAttribute("rel") === $linkrel) {
-          $data["link"] ??= $link->getAttribute("href");
+          $data["link"] ??= htmlentities($link->getAttribute("href"));
         }
       }
-      $data["link"] ??= @$data["links"][0];
+      $data["link"] ??= $data["links"][0]["href"] ?? NULL;
@@ -106,7 +133,7 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
         ?->item(0)?->textContent;
       $data["date"] ??= $node->getElementsByTagName("updated")
         ?->item(0)?->textContent;
-      $data["date"] = new DateTime($data["date"]);
+      $data["date"] = new DateTime($data["date"] ?? '@0');
       $data["date"]->setTimezone($config["timezone"]);
 
       $parsed[] = $data;
@@ -131,6 +158,40 @@ function load_cached(int $ttl, string $uri, string $linkrel = "alternate"): arra
   }
 }
 
+// Sanitises an HTML fragment: drops style/script/link elements and javascript: URLs in href/src.
+// potentially unsafe, shouldn't matter cause source is always trusted
+// TODO: sniff for 8.4 Dom\HTMLDocument when 8.4 releases <https://www.php.net/manual/en/domdocument.loadhtml.php>
+function strip_html(string $html): string {
+  // nothing to parse; an empty fragment sanitises to an empty string
+  if (trim($html) === "") return "";
+
+  $doc = new DomDocument();
+  // the encoding declaration stops libxml from reading the UTF-8 input as ISO-8859-1;
+  // @ silences the warnings libxml raises for markup it does not recognise
+  @$doc->loadHTML('<?xml encoding="UTF-8">' . $html);
+
+  // snapshot each live node list first: removing nodes while iterating it skips elements
+  foreach(["style", "script", "link"] as $tag)
+    foreach(iterator_to_array($doc->getElementsByTagName($tag)) as $el)
+      $el->remove();
+
+  // case-insensitive, and tolerant of leading whitespace before the scheme
+  foreach($doc->getElementsByTagName("*") as $el)
+    foreach(["href", "src"] as $attr)
+      if (stripos(ltrim($el->getAttribute($attr)), "javascript:") === 0)
+        $el->setAttribute($attr, "javascript:alert('Link stripped for security.')");
+
+  // input holding only stripped elements (or comments) may leave the document without a <body>
+  $body = $doc->getElementsByTagName("body")->item(0);
+  if ($body === NULL) return "";
+
+  return implode(array_map(fn($x) => $doc->saveHTML($x),
+                           iterator_to_array($body->childNodes)));
+}
+
+// code begins here
+$config["link_target"] = htmlentities($config["link_target"]);
+
 @mkdir($config["cache_dir"], recursive: true);
 
 foreach ($_GET["disabled"]??[] as $idx => $feed) {
@@ -153,8 +214,8 @@ foreach ($feeds as $name => $data) {
   if (!isset($data["linkrel"]))
     $data["linkrel"] = "alternate";
   foreach(load_cached($data["ttl"], $data["url"], $data["linkrel"]) as $entry) {
-    $entry["source"] = $name;
-    $entry["home"] = @$data["home"];
+    $entry["source"] = htmlentities($name);
+    $entry["home"] = htmlentities($data["home"] ?? "");
     $combined[] = $entry;
   }
 }
@@ -205,16 +266,16 @@ $base = parse_url($_SERVER["REQUEST_URI"], PHP_URL_PATH);
 <?php endif;
 foreach ($combined as $entry): ?>
   <article>
-    <h1><a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['link']) ?>"><?= htmlentities($entry['title'] ?? "[[[No Title]]]") ?></a></h1>
-    <span class="source" data-source="<?= htmlentities($entry['source']) ?>">
+    <h1><a target="<?= $config['link_target'] ?>" href="<?= $entry['link'] ?>"><?= $entry['title'] ?></a></h1>
+    <span class="source" data-source="<?= $entry['source'] ?>">
     <?php if ($entry['home']): ?>
-      <a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['home']) ?>"><?= htmlentities($entry['source']) ?></a>
+      <a target="<?= $config['link_target'] ?>" href="<?= $entry['home'] ?>"><?= $entry['source'] ?></a>
     <?php else: ?>
-      <?= htmlentities($entry['source']) ?>
+      <?= $entry['source'] ?>
     <?php endif; ?>
     </span>
     •
-    <time datetime="<?= htmlentities($entry['date']->format(DateTime::ISO8601)) ?>">
+    <time datetime="<?= $entry['date']->format(DateTime::ISO8601) ?>">
       <?= htmlentities($entry['date']->format($config['date_fmt'])) ?>
     </time>
   </article>