aboutsummaryrefslogtreecommitdiffstats
path: root/rss.php
diff options
context:
space:
mode:
authorAleteoryx <alyx@aleteoryx.me>2024-11-14 16:23:52 -0500
committerAleteoryx <alyx@aleteoryx.me>2024-11-14 16:23:52 -0500
commit1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f (patch)
treeef229301b1add213db87a20660fb1ba7a1cf15cc /rss.php
parent4fb02a34f4f6e3dd4a9c55a604fffc65da368c14 (diff)
downloadrss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.tar.gz
rss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.tar.bz2
rss_dot_php-1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f.zip
parse out html
Diffstat (limited to 'rss.php')
-rw-r--r--rss.php87
1 files changed, 74 insertions, 13 deletions
diff --git a/rss.php b/rss.php
index 14caeaa..783a119 100644
--- a/rss.php
+++ b/rss.php
@@ -53,7 +53,7 @@ $config["link_target"] = "_top";
/* --- CODE - DO NOT TOUCH --- */
-function load_rss(string $uri, string $linkrel = "alternate"): array {
+function load_rss(string $uri, string $linkrel = "alternate", bool? $allow_html = NULL): array {
global $config;
$xml = file_get_contents($uri);
@@ -78,11 +78,24 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
foreach ($doc->getElementsByTagName("item") as $node) {
$data["title"] = $node->getElementsByTagName("title")
?->item(0)?->textContent;
+ $data["title"] ??= "[[[No Title]]]";
+ $data["title"] = htmlentities(html_entity_decode($data["title"]));
+
$data["link"] ??= $node->getElementsByTagName("link")
?->item(0)?->textContent;
+ $data["link"] ??= htmlentities($data["link"]);
+
+ // assume rss is html by default
+ $data["content"] = $node->getElementsByTagName("description")
+ ?->item(0)?->textContent;
+ if ($allow_html === TRUE || $allow_html === NULL) {
+ $data["content"] = strip_html($data["content"]);
+ } else {
+ $data["content"] = htmlentities(html_entity_decode($data["content"]));
+ }
$data["date"] = new DateTime($node->getElementsByTagName("pubDate")
- ->item(0)->textContent);
+ ?->item(0)?->textContent ?? '@0');
$data["date"]->setTimezone($config["timezone"]);
$parsed[] = $data;
@@ -92,12 +105,26 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
foreach ($doc->getElementsByTagName("entry") as $node) {
$data["title"] = $node->getElementsByTagName("title")
?->item(0)?->textContent;
+ $data["title"] ??= "[[[No Title]]]";
+ $data["title"] = htmlentities(html_entity_decode($data["title"]));
+
+ $data["content"] = $node->getElementsByTagName("content")
+ ?->item(0)?->textContent;
+
+ if ($node->getElementsByTagName("content")
+ ?->item(0)
+ ?->getAttribute("type") === "html" && $allow_html !== FALSE) {
+ $data["content"] = strip_html($data["content"])
+ } else {
+ $data["content"] = htmlentities(html_entity_decode($data["content"]));
+ }
+
$data["links"] = [];
foreach ($node->getElementsByTagName("link")->getIterator() as $link) {
- $date["links"][] = ["rel" => $link->getAttribute("rel"),
- "href" => $link->getAttribute("href")];
+ $date["links"][] = ["rel" => htmlentities($link->getAttribute("rel")),
+ "href" => htmlentities($link->getAttribute("href"))];
if ($link->getAttribute("rel") === $linkrel) {
- $data["link"] ??= $link->getAttribute("href");
+ $data["link"] ??= htmlentities($link->getAttribute("href"));
}
}
$data["link"] ??= @$data["links"][0];
@@ -106,7 +133,7 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
?->item(0)?->textContent;
$data["date"] ??= $node->getElementsByTagName("updated")
?->item(0)?->textContent;
- $data["date"] = new DateTime($data["date"]);
+ $data["date"] = new DateTime($data["date"] ?? '@0');
$data["date"]->setTimezone($config["timezone"]);
$parsed[] = $data;
@@ -131,6 +158,40 @@ function load_cached(int $ttl, string $uri, string $linkrel = "alternate"): arra
}
}
+/**
+ * Sanitize an HTML fragment taken from a feed so it can be shown inline.
+ *
+ * Removes every <style>, <script> and <link> element, replaces href/src
+ * attributes that use the javascript: scheme with a harmless alert, and
+ * returns the serialized children of the parsed <body>.
+ *
+ * Potentially unsafe (e.g. on* event-handler attributes are left alone);
+ * shouldn't matter because the source is always trusted.
+ *
+ * TODO: sniff for 8.4 Dom\HTMLDocument when 8.4 releases
+ * <https://www.php.net/manual/en/domdocument.loadhtml.php>
+ *
+ * @param ?string $html HTML fragment, expected to be UTF-8. NULL or blank
+ *                      input yields an empty string.
+ * @return string The sanitized markup.
+ */
+function strip_html(?string $html): string {
+  // loadHTML() throws a ValueError on an empty source, and a feed item
+  // without a description hands us NULL.
+  if ($html === NULL || trim($html) === "")
+    return "";
+
+  $doc = new DOMDocument();
+  // libxml assumes ISO-8859-1 unless told otherwise; the meta element lands in
+  // <head>, so it never shows up in the returned body markup.
+  @$doc->loadHTML('<meta http-equiv="Content-Type" content="text/html; charset=utf-8">' . $html);
+
+  // getElementsByTagName() returns a live list: snapshot it first, otherwise
+  // removing a node makes the iteration skip its next sibling match.
+  foreach (["style", "script", "link"] as $tag) {
+    foreach (iterator_to_array($doc->getElementsByTagName($tag)) as $el)
+      $el->remove();
+  }
+
+  foreach (iterator_to_array($doc->getElementsByTagName("*")) as $el) {
+    foreach (["href", "src"] as $attr) {
+      if (!$el->hasAttribute($attr))
+        continue;
+      // Browsers ignore whitespace/control characters and letter case when
+      // resolving the scheme, so normalise before comparing.
+      $url = (string) preg_replace('/[\x00-\x20]+/', '', $el->getAttribute($attr));
+      if (stripos($url, "javascript:") === 0)
+        $el->setAttribute($attr, "javascript:alert('Link stripped for security.')");
+    }
+  }
+
+  // No <body> is produced when the fragment has no renderable content.
+  $body = $doc->getElementsByTagName("body")->item(0);
+  if ($body === NULL)
+    return "";
+
+  return implode(
+    array_map(
+      fn($x) => $doc->saveHTML($x),
+      iterator_to_array($body->childNodes)));
+}
+
+// code begins here
+$config["link_target"] = htmlentities($config["link_target"]);
+
@mkdir($config["cache_dir"], recursive: true);
foreach ($_GET["disabled"]??[] as $idx => $feed) {
@@ -153,8 +214,8 @@ foreach ($feeds as $name => $data) {
if (!isset($data["linkrel"])) $data["linkrel"] = "alternate";
foreach(load_cached($data["ttl"], $data["url"], $data["linkrel"]) as $entry) {
- $entry["source"] = $name;
- $entry["home"] = @$data["home"];
+ $entry["source"] = htmlentities($name);
+ $entry["home"] = htmlentities(@$data["home"]);
$combined[] = $entry;
}
}
@@ -205,16 +266,16 @@ $base = parse_url($_SERVER["REQUEST_URI"], PHP_URL_PATH);
<?php endif;
foreach ($combined as $entry): ?>
<article>
- <h1><a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['link']) ?>"><?= htmlentities($entry['title'] ?? "[[[No Title]]]") ?></a></h1>
- <span class="source" data-source="<?= htmlentities($entry['source']) ?>">
+ <h1><a target="<?= $config['link_target'] ?>" href="<?= $entry['link'] ?>"><?= $entry['title'] ?></a></h1>
+ <span class="source" data-source="<?= $entry['source'] ?>">
<?php if ($entry['home']): ?>
- <a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['home']) ?>"><?= htmlentities($entry['source']) ?></a>
+ <a target="<?= $config['link_target'] ?>" href="<?= $entry['home'] ?>"><?= $entry['source'] ?></a>
<?php else: ?>
- <?= htmlentities($entry['source']) ?>
+ <?= $entry['source'] ?>
<?php endif; ?>
</span>
&bullet;
- <time datetime="<?= htmlentities($entry['date']->format(DateTime::ISO8601)) ?>">
+ <time datetime="<?= $entry['date']->format(DateTime::ISO8601) ?>">
<?= htmlentities($entry['date']->format($config['date_fmt'])) ?>
</time>
</article>