From 1551d2bda2ad95231f9cde3b7cf94a4a1e01a75f Mon Sep 17 00:00:00 2001
From: Aleteoryx <alyx@aleteoryx.me>
Date: Thu, 14 Nov 2024 16:23:52 -0500
Subject: parse out html

---
 rss.php | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 74 insertions(+), 13 deletions(-)

diff --git a/rss.php b/rss.php
index 14caeaa..783a119 100644
--- a/rss.php
+++ b/rss.php
@@ -53,7 +53,7 @@ $config["link_target"] = "_top";
 
 /* --- CODE - DO NOT TOUCH --- */
 
-function load_rss(string $uri, string $linkrel = "alternate"): array {
+function load_rss(string $uri, string $linkrel = "alternate", ?bool $allow_html = NULL): array { // $allow_html: FALSE forces content to be escaped as plain text
   global $config;
 
   $xml = file_get_contents($uri);
@@ -78,11 +78,24 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
     foreach ($doc->getElementsByTagName("item") as $node) {
       $data["title"] = $node->getElementsByTagName("title")
                             ?->item(0)?->textContent;
+      $data["title"] ??= "[[[No Title]]]";
+      $data["title"] = htmlentities(html_entity_decode($data["title"]));
+
       $data["link"] ??= $node->getElementsByTagName("link")
                              ?->item(0)?->textContent;
+      // the template echoes this verbatim, so it must always be escaped here
+      $data["link"] = htmlentities($data["link"] ?? "");
+
+      // assume rss is html by default; a missing <description> becomes ""
+      $data["content"] = $node->getElementsByTagName("description")
+                              ?->item(0)?->textContent ?? "";
+      // only an explicit FALSE turns HTML off
+      $data["content"] = $allow_html === FALSE
+        ? htmlentities(html_entity_decode($data["content"]))
+        : strip_html($data["content"]);
 
       $data["date"] = new DateTime($node->getElementsByTagName("pubDate")
-                                        ->item(0)->textContent);
+                                        ?->item(0)?->textContent ?? '@0');
       $data["date"]->setTimezone($config["timezone"]);
 
       $parsed[] = $data;
@@ -92,12 +105,26 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
     foreach ($doc->getElementsByTagName("entry") as $node) {
       $data["title"] = $node->getElementsByTagName("title")
                           ?->item(0)?->textContent;
+      $data["title"] ??= "[[[No Title]]]";
+      $data["title"] = htmlentities(html_entity_decode($data["title"]));
+
+      // a missing <content> element is treated as empty text
+      $content = $node->getElementsByTagName("content")?->item(0);
+      $data["content"] = $content?->textContent ?? "";
+
+      // keep markup only for type="html" content, unless the caller passed FALSE
+      if ($content?->getAttribute("type") === "html" && $allow_html !== FALSE) {
+        $data["content"] = strip_html($data["content"]);
+      } else {
+        $data["content"] = htmlentities(html_entity_decode($data["content"]));
+      }
+
       $data["links"] = [];
       foreach ($node->getElementsByTagName("link")->getIterator() as $link) {
-        $date["links"][] = ["rel" => $link->getAttribute("rel"),
-                          "href" => $link->getAttribute("href")];
+        $date["links"][] = ["rel" => htmlentities($link->getAttribute("rel")),
+                          "href" => htmlentities($link->getAttribute("href"))]; // NOTE(review): appends to $date, not $data, so $data["links"] stays empty - confirm intent
         if ($link->getAttribute("rel") === $linkrel) {
-          $data["link"] ??= $link->getAttribute("href");
+          $data["link"] ??= htmlentities($link->getAttribute("href"));
         }
       }
       $data["link"] ??= @$data["links"][0];
@@ -106,7 +133,7 @@ function load_rss(string $uri, string $linkrel = "alternate"): array {
                            ?->item(0)?->textContent;
       $data["date"] ??= $node->getElementsByTagName("updated")
                              ?->item(0)?->textContent;
-      $data["date"] = new DateTime($data["date"]);
+      $data["date"] = new DateTime($data["date"] ?? '@0');
       $data["date"]->setTimezone($config["timezone"]);
 
       $parsed[] = $data;
@@ -131,6 +158,40 @@ function load_cached(int $ttl, string $uri, string $linkrel = "alternate"): arra
   }
 }
 
+// Sanitises feed HTML: drops <style>/<script>/<link>, neuters javascript: URLs.
+// Still potentially unsafe (e.g. on* handlers are kept); sources must be trusted.
+// TODO: sniff for 8.4 Dom\HTMLDocument when 8.4 releases
+// <https://www.php.net/manual/en/domdocument.loadhtml.php>
+function strip_html(string $html): string {
+  // loadHTML() throws a ValueError on an empty string
+  if (trim($html) === "") return "";
+
+  $doc = new DomDocument();
+  @$doc->loadHTML($html);
+
+  // node lists are live: snapshot before removing, or matches get skipped
+  foreach (["style", "script", "link"] as $tag)
+    foreach (iterator_to_array($doc->getElementsByTagName($tag)) as $el)
+      $el->remove();
+
+  foreach ($doc->getElementsByTagName("*") as $el)
+    foreach (["href", "src"] as $attr)
+      if (preg_match('/^[\x00-\x20]*javascript:/i', $el->getAttribute($attr)))
+        $el->setAttribute($attr, "javascript:alert('Link stripped for security.')");
+
+  // item(0) is null if the parser produced no <body>
+  $body = $doc->getElementsByTagName("body")->item(0);
+  if ($body === null) return "";
+
+  return implode(
+    array_map(
+      fn($x) => $doc->saveHTML($x),
+      iterator_to_array($body->childNodes)));
+}
+
+// code begins here
+$config["link_target"] = htmlentities($config["link_target"]);
+
 @mkdir($config["cache_dir"], recursive: true);
 
 foreach ($_GET["disabled"]??[] as $idx => $feed) {
@@ -153,8 +214,8 @@ foreach ($feeds as $name => $data) {
   if (!isset($data["linkrel"])) $data["linkrel"] = "alternate";
 
   foreach(load_cached($data["ttl"], $data["url"], $data["linkrel"]) as $entry) {
-    $entry["source"] = $name;
-    $entry["home"] = @$data["home"];
+    $entry["source"] = htmlentities($name);
+    $entry["home"] = htmlentities($data["home"] ?? ""); // "" when no home page is configured; passing NULL to htmlentities() is deprecated
     $combined[] = $entry;
   }
 }
@@ -205,16 +266,16 @@ $base = parse_url($_SERVER["REQUEST_URI"], PHP_URL_PATH);
 <?php endif;
       foreach ($combined as $entry): ?>
     <article>
-      <h1><a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['link']) ?>"><?= htmlentities($entry['title'] ?? "[[[No Title]]]") ?></a></h1>
-      <span class="source" data-source="<?= htmlentities($entry['source']) ?>">
+      <h1><a target="<?= $config['link_target'] ?>" href="<?= $entry['link'] ?>"><?= $entry['title'] ?></a></h1>
+      <span class="source" data-source="<?= $entry['source'] ?>">
         <?php if ($entry['home']): ?>
-        <a target="<?= htmlentities($config['link_target']) ?>" href="<?= htmlentities($entry['home']) ?>"><?= htmlentities($entry['source']) ?></a>
+        <a target="<?= $config['link_target'] ?>" href="<?= $entry['home'] ?>"><?= $entry['source'] ?></a>
         <?php else: ?>
-        <?= htmlentities($entry['source']) ?>
+        <?= $entry['source'] ?>
         <?php endif; ?>
       </span>
       &bullet;
-      <time datetime="<?= htmlentities($entry['date']->format(DateTime::ISO8601)) ?>">
+      <time datetime="<?= $entry['date']->format(DateTime::ISO8601) ?>">
         <?= htmlentities($entry['date']->format($config['date_fmt'])) ?>
       </time>
     </article>
-- 
cgit v1.2.3-70-g09d2