Merge pull request #376 from Gelmir/feedreader_atom

Support Atom feeds
2025-01-24 18:44:52 +08:00 · 2013-07-02 10:14:02 -07:00 · 2013-07-02 10:14:02 -07:00 · a29f858f77
commit a29f858f77
parent bc605fe09e 7c1e91f256
2 changed files with 132 additions and 3 deletions
--- a/src/rss/rssparser.cpp
+++ b/src/rss/rssparser.cpp
@ -35,6 +35,7 @@
 #include <QRegExp>
 #include <QStringList>
 #include <QVariant>
+#include <QTextDocument>

 struct ParsingJob {
  QString feedUrl;
@ -236,7 +237,7 @@ void RssParser::run()
    if (!m_queue.empty()) {
      ParsingJob job = m_queue.dequeue();
      m_mutex.unlock();
-      parseRSS(job);
+      parseFeed(job);
    } else {
      qDebug() << Q_FUNC_INFO << "Thread is waiting.";
      m_waitCondition.wait(&m_mutex);
@ -326,8 +327,129 @@ void RssParser::parseRSSChannel(QXmlStreamReader& xml, const QString& feedUrl)
  }
 }

+// Parses a single Atom <entry> and emits newArticle() with the fields found.
+//
+// xml     - reader positioned on the <entry> start element; it is advanced
+//           up to the matching </entry> (or until the reader reports the end
+//           of the document / an error).
+// feedUrl - URL of the feed, forwarded unchanged to newArticle().
+// baseUrl - base URI used to complete relative entry links; may be empty.
+//
+// Collected keys: "title", "news_link", "description", "date", "author", "id".
+// If the entry has no <id>, the link and then the title are used as the id;
+// an entry with none of the three is dropped with a warning.
+void RssParser::parseAtomArticle(QXmlStreamReader& xml, const QString& feedUrl, const QString& baseUrl)
+{
+  QVariantHash article;
+  bool double_content = false;
+
+  while(!xml.atEnd()) {
+    xml.readNext();
+
+    if(xml.isEndElement() && xml.name() == "entry")
+      break;
+
+    if (xml.isStartElement()) {
+      if (xml.name() == "title") {
+        // Workaround for CDATA (QString cannot parse html escapes on its own)
+        QTextDocument doc;
+        doc.setHtml(xml.readElementText());
+        article["title"] = doc.toPlainText();
+      }
+      else if (xml.name() == "link") {
+        QString theLink = ( xml.attributes().isEmpty() ?
+                              xml.readElementText() :
+                              xml.attributes().value("href").toString() );
+
+        // Atom feeds can have relative links, work around this and
+        // spare the UI from having to figure out the full article URI.
+
+        // A link that already starts with a URI scheme ("http:", "https:", ...)
+        // is absolute; prepending the base to it would produce a broken URL.
+        QRegExp uriScheme("^[A-Za-z][A-Za-z0-9+.-]*:");
+        const bool isAbsolute = (uriScheme.indexIn(theLink) == 0);
+
+        // Assemble full URI
+        article["news_link"] = ( (baseUrl.isEmpty() || isAbsolute) ?
+                                   theLink :
+                                   baseUrl + theLink );
+      }
+      else if (xml.name() == "summary" || xml.name() == "content"){
+        if(double_content) { // Duplicate content -> ignore
+          // skipCurrentElement() stops at the matching end tag or as soon as
+          // the reader hits an error, so a truncated feed cannot loop forever.
+          xml.skipCurrentElement();
+          continue;
+        }
+
+        // Try to also parse broken articles, which don't use html '&' escapes
+        // Actually works great for non-broken content too
+        QString feedText = xml.readElementText(QXmlStreamReader::IncludeChildElements);
+        if (!feedText.isEmpty())
+          article["description"] = feedText;
+
+        double_content = true;
+      }
+      else if (xml.name() == "updated"){
+        // ATOM uses standard compliant date, don't do fancy stuff
+        QDateTime articleDate = QDateTime::fromString(xml.readElementText(), Qt::ISODate);
+        article["date"] = ( articleDate.isValid() ?
+                              articleDate :
+                              QDateTime::currentDateTime() );
+      }
+      else if (xml.name() == "author") {
+        // Walk the direct children of <author>; readNextStartElement() returns
+        // false at </author> and also on a reader error, so this terminates
+        // even when the document is cut off in the middle of the element.
+        while (xml.readNextStartElement()) {
+          if (xml.name() == "name")
+            article["author"] = xml.readElementText();
+          else
+            xml.skipCurrentElement();
+        }
+      }
+      else if (xml.name() == "id")
+        article["id"] = xml.readElementText();
+    }
+  }
+
+  if (!article.contains("id")) {
+    // Item does not have a guid, fall back to some other identifier
+    const QString link = article.value("news_link").toString();
+    if (!link.isEmpty())
+      article["id"] = link;
+    else {
+      const QString title = article.value("title").toString();
+      if (!title.isEmpty())
+        article["id"] = title;
+      else {
+        qWarning() << "Item has no guid, link or title, ignoring it...";
+        return;
+      }
+    }
+  }
+
+  emit newArticle(feedUrl, article);
+}
+
+// Walks an Atom <feed> element and dispatches its children.
+//
+// xml     - reader positioned on the <feed> start element (asserted).
+// feedUrl - URL of the feed; used as the key in m_lastBuildDates and passed
+//           along with every emitted signal.
+//
+// <title>   -> emits feedTitle().
+// <updated> -> compared with the value stored for this feed; when it is the
+//              same, parsing stops, otherwise the stored value is replaced.
+// <entry>   -> handed to parseAtomArticle() together with the feed's xml:base.
+void RssParser::parseAtomChannel(QXmlStreamReader& xml, const QString& feedUrl)
+{
+  qDebug() << Q_FUNC_INFO << feedUrl;
+  Q_ASSERT(xml.isStartElement() && xml.name() == "feed");
+
+  const QString feedBase = xml.attributes().value("xml:base").toString();
+
+  while (!xml.atEnd()) {
+    xml.readNext();
+
+    // Only start elements are of interest here
+    if (!xml.isStartElement())
+      continue;
+
+    if (xml.name() == "entry") {
+      parseAtomArticle(xml, feedUrl, feedBase);
+      continue;
+    }
+
+    if (xml.name() == "title") {
+      const QString channelTitle = xml.readElementText();
+      emit feedTitle(feedUrl, channelTitle);
+      continue;
+    }
+
+    if (xml.name() != "updated")
+      continue;
+
+    const QString stamp = xml.readElementText();
+    if (stamp.isEmpty())
+      continue;
+
+    // m_lastBuildDates is accessed under m_mutex
+    QMutexLocker guard(&m_mutex);
+    if (m_lastBuildDates.value(feedUrl) == stamp) {
+      qDebug() << "The RSS feed has not changed since last time, aborting parsing.";
+      return;
+    }
+    m_lastBuildDates[feedUrl] = stamp;
+  }
+}
+
 // read and create items from a rss document
-void RssParser::parseRSS(const ParsingJob& job)
+void RssParser::parseFeed(const ParsingJob& job)
 {
  qDebug() << Q_FUNC_INFO << job.feedUrl << job.filePath;
  QFile fileRss(job.filePath);
@ -352,6 +474,11 @@ void RssParser::parseRSS(const ParsingJob& job)
        }
      }
      break;
+    }
+    else if (xml.name() == "feed") { // Atom feed
+      parseAtomChannel(xml, job.feedUrl);
+      found_channel = true;
+      break;
    } else {
      qDebug() << "Skip root item: " << xml.name();
      xml.skipCurrentElement();
--- a/src/rss/rssparser.h
+++ b/src/rss/rssparser.h
@ -61,7 +61,9 @@ protected:
  static QDateTime parseDate(const QString& string);
  void parseRssArticle(QXmlStreamReader& xml, const QString& feedUrl);
  void parseRSSChannel(QXmlStreamReader& xml, const QString& feedUrl);
-  void parseRSS(const ParsingJob& job);
+  void parseAtomArticle(QXmlStreamReader& xml, const QString& feedUrl, const QString& baseUrl);
+  void parseAtomChannel(QXmlStreamReader& xml, const QString& feedUrl);
+  void parseFeed(const ParsingJob& job);
  void reportFailure(const ParsingJob& job, const QString& error);

 private: