From 5698f2c8b93b8924f7bf0c18af5f65a0bac0dcaf Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Tue, 24 Aug 2010 15:24:02 +0200 Subject: [PATCH] Store the Last-Modified time as received from the webserver. Use this for If-Modified-Since, instead of just using the timestamp of the latest blog post found. This should keep us from pulling the full feed from any blog that has modified the contents of a post (or comments) without updating the post date/GUID. --- aggregator.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/aggregator.py b/aggregator.py index 3d55ffe..862e97f 100755 --- a/aggregator.py +++ b/aggregator.py @@ -93,8 +93,20 @@ class Aggregator: guidisperma = True if self.StoreEntry(feedinfo[0], entry.id, entry.date, entry.link, guidisperma, entry.title, txt) > 0: numadded += 1 - if numadded > 0: - self.db.cursor().execute("UPDATE planet.feeds SET lastget=COALESCE((SELECT max(dat) FROM planet.posts WHERE planet.posts.feed=planet.feeds.id),'2000-01-01') WHERE planet.feeds.id=%(feed)s", {'feed': feedinfo[0]}) + + # Check if we got back a Last-Modified time + if hasattr(feed, 'modified') and feed['modified']: + # Last-Modified header retrieved. If we did receive it, we will + # trust the content (assuming we can parse it) + self.db.cursor().execute("UPDATE planet.feeds SET lastget=%(date)s WHERE id=%(feed)s AND NOT lastget=%(date)s", { 'date': datetime.datetime(*feed['modified'][:6]), 'feed': feedinfo[0]}) + else: + # We didn't get a Last-Modified time, so set it to the entry date + # for the latest entry in this feed. Only do this if we added at + # least one entry. + if numadded > 0: + self.db.cursor().execute("UPDATE planet.feeds SET lastget=COALESCE((SELECT max(dat) FROM planet.posts WHERE planet.posts.feed=planet.feeds.id),'2000-01-01') WHERE planet.feeds.id=%(feed)s", {'feed': feedinfo[0]}) + + # Return the number of entries we actually added return numadded def matches_filter(self, entry): -- 2.39.5