Teach CM scraper to bypass the personal data request
author Magnus Hagander <magnus@hagander.net>
Sun, 25 Feb 2018 14:07:37 +0000 (15:07 +0100)
committer Magnus Hagander <magnus@hagander.net>
Sun, 25 Feb 2018 14:07:37 +0000 (15:07 +0100)
The website has started requesting a landline phone number. At least for
now it's possible to bypass this by asking to be reminded later, so
teach the scraper to do just that.

postgresqleu/cmutuel/management/commands/cmscrape.py

index 764d11f2faa06e6d5a03b3038cddaeacfa12f0cd..51756b1779b24271e768aef05df4f83fef97b8a5 100755 (executable)
@@ -71,8 +71,11 @@ class CurlWrapper(object):
                        (c,s) = self.get(fetchpage)
                if c.getinfo(pycurl.RESPONSE_CODE) != 302:
                        raise CommandError("Supposed to receive 302 for %s, got %s" % (fetchpage, c.getinfo(c.RESPONSE_CODE)))
-               if c.getinfo(pycurl.REDIRECT_URL) != redirectto:
-                       raise CommandError("Received unexpected redirect from %s to '%s'" % (fetchpage, c.getinfo(pycurl.REDIRECT_URL)))
+               if not isinstance(redirectto, list):
+                       redirrectto = [redirectto, ]
+               if not c.getinfo(pycurl.REDIRECT_URL) in redirectto:
+                       raise CommandError("Received unexpected redirect from %s to '%s' (expected %s)" % (fetchpage, c.getinfo(pycurl.REDIRECT_URL), redirectto))
+               return c.getinfo(pycurl.REDIRECT_URL)
 
 
 class Command(BaseCommand):
@@ -100,8 +103,13 @@ class Command(BaseCommand):
                # Follow a redirect chain to collect more cookies
                curl.expect_redirect('https://www.creditmutuel.fr/en/banque/pageaccueil.html',
                                                         'https://www.creditmutuel.fr/en/banque/paci_engine/engine.aspx')
-               curl.expect_redirect('https://www.creditmutuel.fr/en/banque/paci_engine/engine.aspx',
-                                                        'https://www.creditmutuel.fr/en/banque/homepage_dispatcher.cgi')
+               got_redir = curl.expect_redirect('https://www.creditmutuel.fr/en/banque/paci_engine/engine.aspx',
+                                                                                ['https://www.creditmutuel.fr/en/banque/homepage_dispatcher.cgi',
+                                                                                'https://www.creditmutuel.fr/en/banque/paci_engine/static_content_manager.aspx'])
+               if got_redir == 'https://www.creditmutuel.fr/en/banque/paci_engine/static_content_manager.aspx':
+                       # Got the "please fill out your personal data" form. So let's bypass it
+                       curl.expect_redirect('https://www.creditmutuel.fr/en/banque/paci_engine/static_content_manager.aspx?_productfilter=PACI&_pid=ContentManager&_fid=DoStopPaciAndRemind',
+                                                                'https://www.creditmutuel.fr/en/banque/homepage_dispatcher.cgi')
 
                # Download the form
                if verbose: self.stdout.write("Downloading form...")