From: Magnus Hagander Date: Mon, 19 Mar 2018 11:26:04 +0000 (+0100) Subject: Automatically follow http->https redirects X-Git-Url: http://git.postgresql.org/gitweb/static/%7B%7Bpguslink%28?a=commitdiff_plain;h=661ef25b76b88b7a798f56ef58be840f91ec1e47;p=hamn.git Automatically follow http->https redirects We only follow the redirect if the URL is *exactly* the same except it has https instead of http in it. But this is a very common scenario these days, so let's simplify that one. In particular, it makes no sense to re-moderate a blog after that change has been made. --- diff --git a/hamnadmin/hamnadmin/register/forms.py b/hamnadmin/hamnadmin/register/forms.py index e72f710..e45f7d9 100644 --- a/hamnadmin/hamnadmin/register/forms.py +++ b/hamnadmin/hamnadmin/register/forms.py @@ -5,7 +5,7 @@ from django.conf import settings from models import Blog -from hamnadmin.util.aggregate import FeedFetcher +from hamnadmin.util.aggregate import FeedFetcher, ParserGotRedirect import urllib import requests @@ -41,6 +41,8 @@ class BlogEditForm(forms.ModelForm): fetcher = FeedFetcher(feedobj, _trace, False) try: entries = list(fetcher.parse()) + except ParserGotRedirect: + raise forms.ValidationError("This URL returns a permanent redirect") except Exception, e: raise forms.ValidationError("Failed to retreive and parse feed: %s" % e) if len(entries) == 0: diff --git a/hamnadmin/hamnadmin/register/management/commands/aggregate_feeds.py b/hamnadmin/hamnadmin/register/management/commands/aggregate_feeds.py index e925614..cc50e66 100644 --- a/hamnadmin/hamnadmin/register/management/commands/aggregate_feeds.py +++ b/hamnadmin/hamnadmin/register/management/commands/aggregate_feeds.py @@ -5,10 +5,11 @@ import gevent from django.core.management.base import BaseCommand, CommandError from django.db import transaction +from django.db.models import Q from django.conf import settings from hamnadmin.register.models import Blog, Post, AggregatorLog -from hamnadmin.util.aggregate import FeedFetcher 
+from hamnadmin.util.aggregate import FeedFetcher, ParserGotRedirect from hamnadmin.mailqueue.util import send_simple_mail from hamnadmin.util.varnish import purge_root_and_feeds @@ -60,7 +61,39 @@ class Command(BaseCommand): try: with transaction.atomic(): for feed, results in pr.get(): - if isinstance(results, Exception): + if isinstance(results, ParserGotRedirect): + # Received a redirect. If this is a redirect for exactly the same URL just + # from http to https, special case this and allow it. For any other redirect, + # we don't follow it since it might no longer be a properly filtered feed + # for example. + if results.url == feed.feedurl: + # Redirect to itself! Should never happen, of course. + AggregatorLog(feed=feed, success=False, + info="Feed returned redirect loop to itself!").save() + elif results.url == feed.feedurl.replace('http://', 'https://'): + # OK, update it! + AggregatorLog(feed=feed, success=True, + info="Feed returned redirect to https, updating registration").save() + send_simple_mail(settings.EMAIL_SENDER, + feed.user.email, + "Your blog at Planet PostgreSQL redirected", + u"The blog aggregator at Planet PostgreSQL has picked up a redirect for your blog.\nOld URL: {0}\nNew URL: {1}\n\nThe database has been updated, and new entries will be fetched from the secure URL in the future.\n".format(feed.feedurl, results.url), + sendername="Planet PostgreSQL", + receivername=u"{0} {1}".format(feed.user.first_name, feed.user.last_name), + ) + send_simple_mail(settings.EMAIL_SENDER, + settings.NOTIFICATION_RECEIVER, + "Blog redirect detected on Planet PostgreSQL", + u"The blog at {0} by {1}\nis returning a redirect to a https version of itself.\n\nThe database has automatically been updated, and will start fetching using https in the future,\n\n".format(feed.feedurl, feed.user), + sendername="Planet PostgreSQL", + receivername="Planet PostgreSQL Moderators", + ) + feed.feedurl = results.url + feed.save() + else: + AggregatorLog(feed=feed, 
success=False, + info="Feed returned redirect (http 301)").save() + elif isinstance(results, Exception): AggregatorLog(feed=feed, success=False, info=results).save() @@ -162,6 +195,8 @@ class Command(BaseCommand): self.trace("Fetching %s since %s" % (fetcher.feed.feedurl, since)) try: entries = list(fetcher.parse(since)) + except ParserGotRedirect, e: + return (fetcher.feed, e) except Exception, e: self.stderr.write("Failed to fetch '%s': %s" % (fetcher.feed.feedurl, e)) return (fetcher.feed, e) diff --git a/hamnadmin/hamnadmin/util/aggregate.py b/hamnadmin/hamnadmin/util/aggregate.py index 51950e6..81f8c69 100644 --- a/hamnadmin/hamnadmin/util/aggregate.py +++ b/hamnadmin/hamnadmin/util/aggregate.py @@ -7,6 +7,11 @@ import feedparser from hamnadmin.register.models import Post +class ParserGotRedirect(Exception): + def __init__(self, url): + self.url = url + super(Exception, self).__init__() + class FeedFetcher(object): def __init__(self, feed, tracefunc=None, update=True): self.feed = feed @@ -39,8 +44,12 @@ class FeedFetcher(object): # Not modified return + if parser.status == 301 and hasattr(parser, 'href'): + # Permanent redirect. Bubble this up with an exception and let the caller + # handle it. + raise ParserGotRedirect(parser.href) + if parser.status != 200: - # XXX: follow redirect? raise Exception('Feed returned status %s' % parser.status) self._trace("Fetched %s, status %s" % (self.feed.feedurl, parser.status))