aboutsummaryrefslogblamecommitdiffstats
path: root/rss.php
blob: e91b1fff18cf650096f2c04cd8f098477bbac4b7 (plain) (tree)
1
2
3
4
5
6
7
8
                                 

                                    

                                                                                                      
 

                                                                              





































                                                          






                                                              

                                 
                                                                                                























                                                                                                


                                                                        

                                                           



                                                                   
                                                           




                                                                              
 









                                                                                       
                                                                         
                                                                          


                                                      
                   





                                                            


                                                                        





























                                                                                                 

       


                              
                                                                             
                                                                     
                                                      
                                                                      







                                                              
                                                          


                                                      
                   





                     
                                                                                                             




                                                 
                                                  







                                                 



                                                                      

                                 
                           


                                                                   






                                                                     












                                                                       





                                                                           

                                      














                                                              




















                                                               
                            

                                                                                                      

                                                  
 
                                   

                               
                                      

                             



                                    
     
 















                                                                                          
                        
                                                          










                                             
                                                                          


                                                                            
 




                                                                                
                                                                                   









                                                                            
 
                                       
                                                   
                               
                                            
                                                                                                                                    
                       
 


                                              















                                                           
            

                               
                                     
                                                                                                          
                      
                               

                       
 



                                                
 
              
                                                                        








                                                                        
<?php define("VERSION", "0.0.2");
/* --- FEEDS - EDIT AS NEEDED --- */

/// Each feed is keyed by its display name. Keys read by the code below:
///
///   "url"        (required) Feed location. Feeds without one are logged
///                and ignored.
///   "home"       Link wrapped around the feed name under each article.
///   "mode"       "title" (default) shows only the title, "no_title" shows
///                only the content, any other value (e.g. "content") shows
///                both.
///   "ttl"        Seconds a cached copy is considered fresh. Default 3600.
///   "linkrel"    Preferred rel= of the Atom <link> used as the article
///                link. Default "alternate".
///   "allow_html" FALSE renders feed content as escaped text; TRUE or unset
///                passes it through strip_html().
///   "media"      FALSE drops attached images and videos.

$feeds["OTW News"]       ["url"]  = "https://www.transformativeworks.org/category/announcement/feed/";
$feeds["OTW News"]       ["mode"] = "content";

$feeds["Dreamwidth News"]["url"]  = "https://dw-news.dreamwidth.org/data/rss";
$feeds["Dreamwidth News"]["home"] = "https://dw-news.dreamwidth.org/";

/* --- CONFIG - EDIT AS NEEDED --- */


/// Directory to store RSS cache.
///
/// Created on startup if it does not exist.
/// Multiple instances can share one dir.
$config["cache_dir"] = "/tmp/rss_dot_php";


/// Custom CSS
///
/// Emitted verbatim (unescaped) inside a <style> element in the page head.
$config["custom_css"] = <<<'EOC'

/* custom CSS goes here! */

EOC;


/// Document Language
///
/// Used as the lang= attribute of the <html> element.
$config["lang"] = "en";


/// Date Format
///
/// Displayed under every article, see
/// <https://www.php.net/manual/en/datetime.format.php>
/// for documentation.
$config["date_fmt"] = "l, M jS, Y, H:i T";


/// Timezone
///
/// A value of type DateTimeZone, see
/// <https://www.php.net/manual/en/class.datetimezone.php>
/// for documentation. Every article date is converted to this zone.
$config["timezone"] = new DateTimeZone('UTC');


/// <a target=? >
///
/// What to set for the target= attribute on generated links.
/// _top will navigate the whole tab, _blank will open a new tab.
$config["link_target"] = "_top";

/* --- CODE - DO NOT TOUCH --- */

/// Fetch one RSS 2.0 or Atom feed and parse it into a list of entries.
///
/// $uri        Anything file_get_contents() can read (URL or path).
/// $linkrel    Atom only: the rel= of the <link> preferred as entry link.
/// $allow_html FALSE escapes entry content as plain text; TRUE or NULL
///             passes HTML content through strip_html().
///
/// Returns a list of arrays. Every string in them is already HTML-escaped
/// (or sanitized HTML, for "content"), ready to be echoed:
///   "title"   string
///   "link"    string (RSS), ?string (Atom)
///   "content" string
///   "images"  string[] (only when present)
///   "videos"  string[] (only when present)
///   "links"   string[] (Atom only)
///   "date"    DateTime in $config["timezone"]; the epoch when the feed
///             gives no usable date
///
/// Returns [] when the feed cannot be fetched or is not well-formed XML.
function load_rss(string $uri, string $linkrel = "alternate", ?bool $allow_html = NULL): array {
  global $config;

  $xml = file_get_contents($uri);
  if ($xml === false || $xml === "") {
    error_log("Feed \"$uri\" could not be loaded or is empty.");
    return [];
  }

  // if the file doesn't contain an encoding, attempt to read it from http headers and re-encode
  if (!preg_match("/^[^>]+encoding/", $xml) && str_starts_with($uri, "http")) {
    // $http_response_header is populated by file_get_contents() for http(s)
    foreach ($http_response_header ?? [] as $header) {
      if (!str_starts_with(strtolower($header), "content-type")) continue;
      if (preg_match("/(?<=charset=)[a-z0-9_-]+/i", $header, $matches)) {
        $recoded = iconv($matches[0], "UTF-8", $xml);
        // keep the original bytes if the advertised charset is bogus
        if ($recoded !== false) {
          $xml = $recoded;
          $doc = new DOMDocument(encoding: "UTF-8");
        }
      }
      break;
    }
  }

  $doc ??= new DOMDocument();

  // collect parse errors internally instead of spraying warnings
  $libxml_prev = libxml_use_internal_errors(true);
  $loaded = $doc->loadXML($xml);
  libxml_clear_errors();
  libxml_use_internal_errors($libxml_prev);

  if (!$loaded || $doc->documentElement === null) {
    error_log("Feed \"$uri\" is not well-formed XML.");
    return [];
  }

  // text of the first descendant <$tag>, NULL when there is none
  $first_text = static fn(DOMElement $el, string $tag): ?string =>
    $el->getElementsByTagName($tag)->item(0)?->textContent;

  // feed date -> DateTime in the configured zone; epoch if missing/garbage
  $to_date = static function (?string $raw) use ($config): DateTime {
    try {
      $date = new DateTime($raw ?? '@0');
    } catch (Exception $e) {
      $date = new DateTime('@0');
    }
    $date->setTimezone($config["timezone"]);
    return $date;
  };

  // escaped URI for an atom <content> carrying media: either its src=, or
  // its base64 payload as a data: URI. NULL when it carries neither.
  $media_uri = static function (DOMElement $el, string $type): ?string {
    if ($el->hasAttribute("src"))
      return htmlentities($el->getAttribute("src"));

    $payload = preg_replace('/\s+/', '', $el->textContent) ?? "";
    if ($payload !== "" && base64_decode($payload, true) !== false)
      return htmlentities("data:".$type.";base64,".$payload);

    return null;
  };

  $parsed = [];

  if ($doc->documentElement->nodeName === "rss") {
    // TODO: better rss / atom sniffing
    foreach ($doc->getElementsByTagName("item") as $node) {
      $data = [];

      $data["title"] = htmlentities(html_entity_decode(
        $first_text($node, "title") ?? "[[[No Title]]]"));

      $data["link"] = htmlentities(trim($first_text($node, "link") ?? ""));

      // assume rss is html by default
      $content = $first_text($node, "description") ?? "";
      if ($allow_html === FALSE) {
        $data["content"] = htmlentities(html_entity_decode($content));
      } else {
        $data["content"] = strip_html($content);
      }

      foreach ($node->getElementsByTagNameNS("http://search.yahoo.com/mrss/", "content") as $media) {
        $type = $media->getAttribute("type");
        if (str_starts_with($type, "image/")) {
          $data["images"][] = htmlentities($media->getAttribute("url"));
        } elseif (str_starts_with($type, "video/")) {
          $data["videos"][] = htmlentities($media->getAttribute("url"));
        }
      }

      $data["date"] = $to_date($first_text($node, "pubDate"));

      $parsed[] = $data;
    }
  } else {
    // assume atom
    foreach ($doc->getElementsByTagName("entry") as $node) {
      $data = [];

      $data["title"] = htmlentities(html_entity_decode(
        $first_text($node, "title") ?? "[[[No Title]]]"));

      foreach ($node->getElementsByTagName("content") as $content) {
        $type = $content->getAttribute("type");

        if ($type === "text") {
          // if we prefer non-html, overwrite html with text
          if ($allow_html === FALSE) {
            $data["content"] = htmlentities($content->textContent);
          } else {
            $data["content"] ??= htmlentities($content->textContent);
          }
        } elseif ($type === "html" || $type === "xhtml") {
          if ($allow_html === FALSE) {
            $data["content"] ??= htmlentities($content->textContent);
          } else {
            $data["content"] = strip_html($content->textContent);
          }
        } elseif (str_starts_with($type, "image/")) {
          $src = $media_uri($content, $type);
          if ($src !== null) $data["images"][] = $src;
        } elseif (str_starts_with($type, "video/")) {
          $src = $media_uri($content, $type);
          if ($src !== null) $data["videos"][] = $src;
        } elseif ($content->hasAttribute("src")) {
          // out-of-line content of some other type: keep it as a link
          $data["links"][] = htmlentities($content->getAttribute("src"));
        }
        // TODO: possible markdown et al. handling?
      }

      $data["content"] ??= "";

      $data["links"] ??= [];
      foreach ($node->getElementsByTagName("link") as $link) {
        $href = htmlentities($link->getAttribute("href"));
        $data["links"][] = $href;
        if ($link->getAttribute("rel") === $linkrel) {
          $data["link"] ??= $href;
        }
      }
      // no link with the preferred rel: fall back to the first one we saw
      $data["link"] ??= $data["links"][0] ?? null;

      $data["date"] = $to_date($first_text($node, "published")
                            ?? $first_text($node, "updated"));

      $parsed[] = $data;
    }
  }

  return $parsed;
}

/// load_rss() behind a file cache in $config["cache_dir"].
///
/// $ttl is the number of seconds a cached copy stays fresh; the remaining
/// parameters are passed through to load_rss() unchanged.
///
/// The cache key covers every argument that influences the parsed result,
/// so feeds (or instances sharing the directory) that read the same URL
/// with different $linkrel / $allow_html do not see each other's output.
///
/// A missing, stale, unreadable or corrupt cache file causes a refetch.
function load_cached(int $ttl, string $uri, string $linkrel = "alternate", ?bool $allow_html = NULL): array {
  global $config;
  $path = $config["cache_dir"]."/".md5(serialize([$uri, $linkrel, $allow_html]));

  $mtime = is_file($path) ? filemtime($path) : false;

  if ($mtime !== false && $mtime + $ttl >= time()) {
    $raw = file_get_contents($path);
    // The cache dir may be world-writable (/tmp): only revive the one class
    // we store ourselves, never arbitrary objects.
    $cached = $raw === false
      ? false
      : unserialize($raw, ["allowed_classes" => [DateTime::class]]);
    if (is_array($cached)) return $cached;
    // fall through and rebuild a broken cache entry
  }

  $data = load_rss($uri, $linkrel, $allow_html);
  if (file_put_contents($path, serialize($data), LOCK_EX) === false) {
    error_log("Could not write feed cache file \"$path\".");
  }
  return $data;
}

// potentially unsafe, shouldn't matter cause source is always trusted
// TODO: sniff for 8.4 Dom\HTMLDocument when 8.4 releases
// <https://www.php.net/manual/en/domdocument.loadhtml.php>
///
/// Best-effort cleanup of an HTML fragment taken from a feed.
///
/// Removes <style>, <script>, <link>, <meta>, <base>, <title>, <template>
/// and <slot> elements, marks images as lazy-loaded, neutralises
/// javascript: URLs in href= / src= and drops autoplay=. Event handler
/// attributes (onclick= etc.) are NOT touched.
///
/// Returns the serialized children of the parsed <body>, or "" when the
/// input yields no body at all.
function strip_html(string $html): string {
  if ($html === "") return $html;

  $doc = new DOMDocument();

  // libxml complains about perfectly normal HTML5; collect, don't print
  $libxml_prev = libxml_use_internal_errors(true);
  // this is a really ugly hack but libxml has left me no choice :(
  $doc->loadHTML("<meta charset='UTF-8'>".$html);
  libxml_clear_errors();
  libxml_use_internal_errors($libxml_prev);

  $unwanted = ["style", "script", "link", "meta", "base", "title", "template", "slot"];
  foreach ($unwanted as $tag) {
    // The node list is live: removing while iterating it directly skips
    // every other match, so walk a snapshot instead.
    foreach (iterator_to_array($doc->getElementsByTagName($tag)) as $el)
      $el->remove();
  }

  foreach ($doc->getElementsByTagName("img") as $el)
    $el->setAttribute("loading", "lazy");

  // Browsers ignore case and embedded whitespace/control characters in the
  // scheme, so the check has to as well.
  $is_js_uri = static fn(string $value): bool =>
    preg_match('/^javascript:/i', preg_replace('/[\x00-\x20]+/', '', $value) ?? "") === 1;

  foreach ($doc->getElementsByTagName("*") as $el) {
    foreach (["href", "src"] as $attr) {
      if ($is_js_uri($el->getAttribute($attr)))
        $el->setAttribute($attr, "javascript:alert('Link stripped for security.')");
    }

    $el->removeAttribute("autoplay");
  }

  $body = $doc->getElementsByTagName("body")->item(0);
  if ($body === null) return "";

  return implode(
    "",
    array_map(
      fn($x) => $doc->saveHTML($x),
      iterator_to_array($body->childNodes)));
}

// code begins here
$config["link_target"] = htmlentities($config["link_target"]);

// Best effort: another request may create the directory between the check
// and the mkdir, hence the second is_dir().
if (!is_dir($config["cache_dir"])
    && !@mkdir($config["cache_dir"], recursive: true)
    && !is_dir($config["cache_dir"])) {
  error_log("Could not create cache directory \"{$config['cache_dir']}\".");
}

// Move every feed named in ?disabled[]=... from $feeds to $off_feeds.
// The parameter is user input: it may be a scalar, contain nested arrays,
// repeat names or name feeds that don't exist. $_GET["disabled"] is
// rewritten to the clean list because the template rebuilds its toggle
// links from $_GET. It stays unset when the parameter was absent, since the
// template uses isset() on it to tell "filtered" from "not filtered".
if (isset($_GET["disabled"])) {
  $requested = is_array($_GET["disabled"]) ? $_GET["disabled"] : [$_GET["disabled"]];
  $_GET["disabled"] = [];
  foreach ($requested as $feed) {
    if (!is_string($feed) || !array_key_exists($feed, $feeds)) continue;
    $off_feeds[$feed] = $feeds[$feed];
    unset($feeds[$feed]);
    $_GET["disabled"][] = $feed;
  }
}

$combined = [];
// Real Feed Processing Happens Here
foreach ($feeds as $name => $data) {
  if (!isset($data["url"])) {
    error_log("Feed \"$name\" missing url. Ignoring.");
    continue;
  }
  $data["ttl"] ??= 3600;
  $data["linkrel"] ??= "alternate";
  $data["mode"] ??= "title";

  $entries = load_cached($data["ttl"], $data["url"], $data["linkrel"], $data["allow_html"] ?? null);
  foreach ($entries as $entry) {
    $entry["source"] = htmlentities((string) $name);
    $entry["home"] = htmlentities($data["home"] ?? "");

    if ($data["mode"] === "title") {
      unset($entry["content"]);
    }
    if ($data["mode"] === "no_title") {
      unset($entry["title"]);
    }

    if (($data["media"] ?? null) === FALSE) {
      unset($entry["images"]);
      unset($entry["videos"]);
    }

    $combined[] = $entry;
  }
}

// reverse-chronological by default
usort($combined, fn($a, $b) => $b["date"]->getTimestamp() <=> $a["date"]->getTimestamp());

if (isset($_GET["reverse"]))
  $combined = array_reverse($combined);

// Path of the current request, used as the target of the toggle links.
// parse_url() gives null/false for a missing or malformed URI.
$base = (string) parse_url($_SERVER["REQUEST_URI"] ?? "", PHP_URL_PATH);


?>
<!doctype html>
<html lang="<?= htmlentities($config['lang']) ?>">
<head>
  <meta charset="UTF-8">
  <style> object, img, video { max-width: 60vw; } </style>
  <?php /* custom_css is trusted config and is emitted verbatim */ ?>
  <style><?= $config['custom_css'] ?></style>
</head>
<body>
  <nav>
    <div>
      <b>Toggle Feeds</b>:
<?php /* enabled feeds: the link adds the feed to ?disabled[] */
foreach ($feeds??[] as $name => $data):
  $query = $_GET;
  // (array) also copes with a scalar ?disabled=foo
  $query["disabled"] = array_values((array) ($query["disabled"] ?? []));
  $query["disabled"][] = $name;
  $uri = $base."?".http_build_query($query);
?>
        <span class="rss-source" data-source="<?= htmlentities($name) ?>">
          <a href="<?= htmlentities($uri) ?>"><?= htmlentities($name) ?></a>
        </span>
<?php endforeach; ?>

<?php /* disabled feeds: the link removes the feed from ?disabled[] */
foreach ($off_feeds??[] as $name => $data):
  $query = $_GET;
  // array keys that look like integers come back as int, hence the cast
  $query["disabled"] = array_values(array_filter(
    (array) ($query["disabled"] ?? []),
    fn($x) => $x !== (string) $name));
  $uri = $base."?".http_build_query($query);
?>
        <span class="rss-source disabled" data-source="<?= htmlentities($name) ?>">
          <a href="<?= htmlentities($uri) ?>"><?= htmlentities($name) ?></a>
        </span>
<?php endforeach; ?>
    </div>
  </nav>
  <main>
<?php if (!count($combined) && isset($_GET['disabled'])): ?>
<h1>Looks like you filtered out everything...</h1>
<p>Try unfiltering some feeds!</p>
<?php endif;

      /* entry fields were escaped / sanitized when the feed was parsed */
      foreach ($combined as $entry): ?>
    <article data-source="<?= $entry['source'] ?>">
      <div class="rss-content">
        <?php if(isset($entry['title'])): ?>
        <h1 class="rss-title"><a target="<?= $config['link_target'] ?>" href="<?= $entry['link'] ?? '' ?>"><?= $entry['title'] ?></a></h1>
        <?php endif; ?>

        <?php if(isset($entry['content'])): ?>
        <div><?= $entry['content'] ?></div>
        <?php endif; ?>

        <?php if(count($entry['images']??[])): ?>
        <div class="rss-images">
          <?php foreach($entry['images'] as $media): ?>
          <img loading="lazy" src="<?= $media ?>">
          <?php endforeach; ?>
        </div>
        <?php endif; ?>

        <?php if(count($entry['videos']??[])): ?>
        <div class="rss-videos">
          <?php foreach($entry['videos'] as $media): ?>
          <video controls src="<?= $media ?>"></video>
          <?php endforeach; ?>
        </div>
        <?php endif; ?>
      </div>

      <span class="rss-source">
        <?php if ($entry['home']): ?>
        <a target="<?= $config['link_target'] ?>" href="<?= $entry['home'] ?>"><?= $entry['source'] ?></a>
        <?php else: ?>
        <?= $entry['source'] ?>
        <?php endif; ?>
      </span>

      <?php /* without a title there is no headline link, so offer one here */ ?>
      <?php if(!isset($entry['title'])): ?>
      &bullet;
      <a target="<?= $config['link_target'] ?>" href="<?= $entry['link'] ?? '' ?>">Source</a>
      <?php endif; ?>

      &bullet;
      <time datetime="<?= $entry['date']->format(DateTime::ISO8601) ?>">
        <?= htmlentities($entry['date']->format($config['date_fmt'])) ?>
      </time>
    </article>
<?php endforeach; ?>
  </main>
  <!-- generated by rss_dot_php <?= VERSION ?>
       https://git.aleteoryx.me/cgit/rss_dot_php -->
</body>
</html>