RSS / ATOM Feed Autodiscovery Class

A class that will autodiscover RSS and ATOM feeds from any supplied URL and package the metadata.

I have several facilities where users can enter a feed URL; my code goes out, validates the feed, and adds it to a database table, which is later used to get the "latest post" from all stored feeds and display them for users. Now that the <link rel="alternate" ... /> autodiscovery standard is widely used, it made sense to accept any URL - not just that of an RSS or ATOM feed - and to check for the presence of the autodiscovery links on the page, get the HREF link, and optionally retrieve the associated feed that way. I also wanted to wrap up the feed, its URL, and other metadata into a small class that would make working with validated feeds easier.

I started out with an enum to identify feeds:

/// <summary>
/// Identifies which syndication format, if any, a feed uses.
/// </summary>
public enum FeedType
    {
        /// <summary>No feed format identified; this is the default value.</summary>
        NONE = 0,

        /// <summary>The feed is RSS.</summary>
        RSS = 1,

        /// <summary>The feed is ATOM.</summary>
        ATOM = 2
    }
Next, I created my FeedMetaData class as a container to hold all the info:

/// <summary>
/// Container bundling a feed's type, its parsed XML content, its URL and a validity flag.
/// </summary>
public class FeedMetaData
    {
        /// <summary>Gets or sets the syndication format of the feed.</summary>
        public FeedType FeedType { get; set; }

        /// <summary>Gets or sets the feed document as XML.</summary>
        public XmlDocument FeedContent { get; set; }

        /// <summary>Gets or sets the URL of the feed.</summary>
        public string FeedUrl { get; set; }

        /// <summary>Gets or sets a value indicating whether the feed is considered valid.</summary>
        public bool IsValid { get; set; }

        /// <summary>
        /// Creates an instance with every property left at its default value.
        /// </summary>
        public FeedMetaData()
        {
        }

        /// <summary>
        /// Creates an instance with all four properties populated.
        /// </summary>
        /// <param name="feedType">Value for <see cref="FeedType"/>.</param>
        /// <param name="content">Value for <see cref="FeedContent"/>.</param>
        /// <param name="isValid">Value for <see cref="IsValid"/>.</param>
        /// <param name="feedUrl">Value for <see cref="FeedUrl"/>.</param>
        public FeedMetaData(FeedType feedType, XmlDocument content, bool isValid, string feedUrl)
        {
            this.FeedUrl = feedUrl;
            this.IsValid = isValid;
            this.FeedContent = content;
            this.FeedType = feedType;
        }
    }
Next, I created my FeedUtil class with its DiscoverFeed method:

/// <summary>
/// Locates and loads RSS / ATOM feeds, either directly from a feed URL or through
/// the <c>&lt;link rel="alternate"&gt;</c> autodiscovery entries of an HTML page.
/// </summary>
public class FeedUtil
    {
        /// <summary>
        /// Downloads <paramref name="url"/> and tries to resolve it to a feed.
        /// </summary>
        /// <remarks>
        /// If the downloaded document carries an XML declaration and contains an
        /// <c>&lt;rss</c> or <c>&lt;feed</c> element it is loaded as the feed itself.
        /// Otherwise the document is scanned for autodiscovery links and the linked
        /// feed is downloaded and loaded instead.
        /// </remarks>
        /// <param name="url">Address of a feed, or of a page that links to one.</param>
        /// <returns>
        /// A <see cref="FeedMetaData"/> whose <c>IsValid</c> is <c>true</c> only when a feed
        /// was downloaded and parsed. Any exception (download, parse, bad argument)
        /// is swallowed and reported as <c>IsValid == false</c>; this method does not throw.
        /// When autodiscovery found a link but loading it failed, <c>FeedUrl</c> still
        /// holds the discovered address.
        /// </returns>
        public static FeedMetaData DiscoverFeed(string url)
        {
            var feedMetaData = new FeedMetaData();
            try
            {
                string document = Download(url);

                // Case 1: the URL points straight at a feed.
                FeedType directType = DetectFeedType(document);
                if (directType != FeedType.NONE)
                {
                    feedMetaData.FeedContent = LoadFeedXml(document);
                    feedMetaData.FeedType = directType;
                    feedMetaData.FeedUrl = url;
                    feedMetaData.IsValid = true;
                    return feedMetaData;
                }

                // Case 2: treat the document as a page and follow its autodiscovery link.
                // This also covers XML documents that are not feeds (e.g. XHTML pages
                // that start with an XML declaration).
                Collection<FeedMetaData> links = LinkChecker.FindLinks("alternate", document);
                FeedMetaData link = ChooseFeedLink(links);
                if (link == null || link.FeedUrl == null)
                {
                    feedMetaData.IsValid = false;
                    return feedMetaData;
                }

                // Recorded before the download so a failed fetch still reports what was found.
                feedMetaData.FeedUrl = link.FeedUrl;
                feedMetaData.FeedContent = LoadFeedXml(Download(link.FeedUrl));
                feedMetaData.FeedType = link.FeedType;
                feedMetaData.IsValid = true;
            }
            catch (Exception)
            {
                // Best effort by design: every failure is reported as "not a valid feed".
                feedMetaData.IsValid = false;
            }
            return feedMetaData;
        }

        /// <summary>Downloads the resource at <paramref name="address"/> as a string.</summary>
        private static string Download(string address)
        {
            using (var client = new WebClient())
            {
                return client.DownloadString(address);
            }
        }

        /// <summary>
        /// Classifies a document as RSS or ATOM by looking for an XML declaration followed
        /// by an <c>&lt;rss</c> or <c>&lt;feed</c> tag; returns <c>FeedType.NONE</c> otherwise.
        /// </summary>
        private static FeedType DetectFeedType(string document)
        {
            int declaration = document.IndexOf("<?xml", StringComparison.Ordinal);
            if (declaration < 0)
            {
                return FeedType.NONE;
            }
            if (document.IndexOf("<rss", declaration, StringComparison.Ordinal) > -1)
            {
                return FeedType.RSS;
            }
            if (document.IndexOf("<feed", declaration, StringComparison.Ordinal) > -1)
            {
                return FeedType.ATOM;
            }
            return FeedType.NONE;
        }

        /// <summary>
        /// Picks the first link with a recognised feed type, falling back to the first
        /// link of any kind; returns <c>null</c> when there are no links.
        /// </summary>
        private static FeedMetaData ChooseFeedLink(Collection<FeedMetaData> links)
        {
            if (links == null || links.Count == 0)
            {
                return null;
            }
            foreach (FeedMetaData candidate in links)
            {
                if (candidate.FeedType != FeedType.NONE)
                {
                    return candidate;
                }
            }
            return links[0];
        }

        /// <summary>
        /// Parses feed text into an <see cref="XmlDocument"/>, first dropping anything
        /// (such as a byte-order mark) that precedes the XML itself.
        /// </summary>
        private static XmlDocument LoadFeedXml(string text)
        {
            // Start at the XML declaration when there is one, otherwise at the first tag.
            // A feed without a declaration is legal XML and must not be rejected.
            int start = text.IndexOf("<?xml", StringComparison.Ordinal);
            if (start < 0)
            {
                start = text.IndexOf('<');
            }
            if (start > 0)
            {
                text = text.Substring(start);
            }

            var xml = new XmlDocument();
            // The content comes from an arbitrary remote server: do not let the parser
            // fetch external DTDs or entities on its behalf.
            xml.XmlResolver = null;
            xml.LoadXml(text);
            return xml;
        }
    }
}
Finally, I needed a way to parse the retrieved HTML for the autodiscovery links. Usually for this kind of work I would use Simon Mourier's HtmlAgilityPack, but in this case I decided to use a combination of regular expressions and string manipulation to keep everything lightweight and have no external assembly dependencies. For this I created the LinkChecker class:

using System;
using System.Collections.ObjectModel;
using System.Diagnostics;
using System.Text.RegularExpressions;

namespace FeedDiscovery
{
    /// <summary>
    /// Extracts <c>&lt;link rel="..."&gt;</c> entries from the <c>&lt;head&gt;</c> of an HTML document
    /// using regular expressions (no HTML parser dependency).
    /// </summary>
    public class LinkChecker
    {
        // Content of a <head> element. "\s" after the name keeps <header> from matching,
        // and the lazy quantifier stops at the first </head>.
        private static readonly Regex HEAD = new Regex("<head(?:\\s[^>]*)?>(.*?)</head\\s*>",
                                                       RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.Compiled);

        // One <link ...> tag; group 1 is its attribute text.
        private static readonly Regex LINK = new Regex("<link(\\s[^>]*)>",
                                                       RegexOptions.IgnoreCase | RegexOptions.Compiled);

        // rel attribute with an exact value, in double or single quotes. {0} is the escaped value.
        private const string REL_PATTERN = "\\srel\\s*=\\s*(?:\"{0}\"|'{0}')";

        // href attribute; the value ends at the matching quote, so attributes that
        // follow href are not swallowed. Group 1 = double-quoted, group 2 = single-quoted.
        private static readonly Regex HREF = new Regex("\\shref\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)')",
                                                       RegexOptions.IgnoreCase | RegexOptions.Compiled);


        /// <summary>
        /// Finds every <c>&lt;link&gt;</c> in the document head whose <c>rel</c> attribute equals
        /// <paramref name="type"/> and whose <c>href</c> is an absolute URI.
        /// </summary>
        /// <param name="type">The <c>rel</c> value to look for, e.g. <c>"alternate"</c>. Compared case-insensitively.</param>
        /// <param name="html">The HTML to scan.</param>
        /// <returns>
        /// One <see cref="FeedMetaData"/> per matching link, in document order, with
        /// <c>FeedUrl</c> set to the absolute URI, <c>IsValid</c> set to <c>true</c>,
        /// <c>FeedContent</c> left <c>null</c>, and <c>FeedType</c> derived from the tag's
        /// <c>application/rss+xml</c> / <c>application/atom+xml</c> marker (<c>NONE</c> when neither
        /// is present). Links with a relative or missing href are skipped. Returns an empty
        /// collection when <paramref name="html"/> is null or empty or has no head element.
        /// </returns>
        public static Collection<FeedMetaData> FindLinks(string type, string html)
        {
            var urls = new Collection<FeedMetaData>();
            if (string.IsNullOrEmpty(html))
            {
                return urls;
            }

            // Escape the caller's value so regex metacharacters in it are matched literally.
            var relAttribute = new Regex(string.Format(REL_PATTERN, Regex.Escape(type ?? string.Empty)),
                                         RegexOptions.IgnoreCase);

            foreach (Match head in HEAD.Matches(html))
            {
                foreach (Match tag in LINK.Matches(head.Groups[1].Value))
                {
                    string attributes = tag.Groups[1].Value;
                    if (!relAttribute.IsMatch(attributes))
                    {
                        continue;
                    }

                    Match hrefMatch = HREF.Match(attributes);
                    if (!hrefMatch.Success)
                    {
                        continue;
                    }

                    string value = hrefMatch.Groups[1].Success
                                       ? hrefMatch.Groups[1].Value
                                       : hrefMatch.Groups[2].Value;
                    // Ampersands in attribute values are written as &amp; in HTML.
                    value = value.Trim().Replace("&amp;", "&");

                    Uri url;
                    if (!Uri.TryCreate(value, UriKind.Absolute, out url))
                    {
                        continue;
                    }

                    // Determined per link so one tag's type never leaks into the next.
                    // Ordinal comparison avoids culture-specific casing rules.
                    FeedType feedType = FeedType.NONE;
                    if (attributes.IndexOf("application/rss+xml", StringComparison.OrdinalIgnoreCase) > -1)
                    {
                        feedType = FeedType.RSS;
                    }
                    if (attributes.IndexOf("application/atom+xml", StringComparison.OrdinalIgnoreCase) > -1)
                    {
                        feedType = FeedType.ATOM;
                    }

                    urls.Add(new FeedMetaData(feedType, null, true, url.AbsoluteUri));
                }
            }
            return urls;
        }
    }
}
To wrap it all up, I made an ASP.NET "Test Harness" page that accepts any URL, does the feed discovery, and displays the results in a GridView. A sample view looks like this:




I hope this little enterprise is useful to you! You can download the complete Visual Studio 2008 Solution here.
By Peter Bromberg   Popularity  (9144 Views)