[weboob] [PATCH 1/1] Implemented history operation for CrAgr backend

Xavier G xavier at tuxfamily.org
Fri Nov 12 22:38:04 CET 2010


Signed-off-by: Xavier G <xavier at tuxfamily.org>
---
 weboob/backends/cragr/backend.py             |    4 +-
 weboob/backends/cragr/browser.py             |   13 +++-
 weboob/backends/cragr/pages/accounts_list.py |  116 ++++++++++++++++++++++++++
 3 files changed, 130 insertions(+), 3 deletions(-)

diff --git a/weboob/backends/cragr/backend.py b/weboob/backends/cragr/backend.py
index 7327302..7ee9342 100644
--- a/weboob/backends/cragr/backend.py
+++ b/weboob/backends/cragr/backend.py
@@ -59,5 +59,5 @@ class CragrBackend(BaseBackend, ICapBank):
         return iter([])
 
     def iter_history(self, account):
-        """ TODO Not supported yet """
-        return iter([])
+        for history in self.browser.get_history(account):
+            yield history
diff --git a/weboob/backends/cragr/browser.py b/weboob/backends/cragr/browser.py
index d8df765..cc1a663 100644
--- a/weboob/backends/cragr/browser.py
+++ b/weboob/backends/cragr/browser.py
@@ -32,6 +32,7 @@ class Cragr(BaseBrowser):
         self.PAGES = {'https://%s/'              % website:   pages.LoginPage,
                       'https://%s/.*\.c.*'       % website:   pages.AccountsList,
                       'https://%s/login/process' % website:   pages.AccountsList,
+                      'https://%s/accounting/listOperations' % website: pages.AccountsList,
                      }
         BaseBrowser.__init__(self, *args, **kwargs)
 
@@ -74,11 +75,21 @@ class Cragr(BaseBrowser):
 
         l = self.get_accounts_list()
         for a in l:
-            if a.id == id:
+            if a.id == ('%s' % id):
                 return a
 
         return None
 
+    def get_history(self, account):
+        page_url = account.link_id
+        operations_count = 0
+        while (page_url):
+            self.location('https://%s%s' % (self.DOMAIN, page_url))
+            for page_operation in self.page.get_history(operations_count):
+                operations_count += 1
+                yield page_operation
+            page_url = self.page.next_page_url()
+
     #def get_coming_operations(self, account):
     #    if not self.is_on_page(pages.AccountComing) or self.page.account.id != account.id:
     #        self.location('/NS_AVEEC?ch4=%s' % account.link_id)
diff --git a/weboob/backends/cragr/pages/accounts_list.py b/weboob/backends/cragr/pages/accounts_list.py
index b789a46..f93a99a 100644
--- a/weboob/backends/cragr/pages/accounts_list.py
+++ b/weboob/backends/cragr/pages/accounts_list.py
@@ -16,8 +16,10 @@
 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 
 
+import re
 from weboob.capabilities.bank import Account
 from .base import CragrBasePage
+from weboob.capabilities.bank import Operation
 
 class AccountsList(CragrBasePage):
     def get_list(self):
@@ -29,11 +31,13 @@ class AccountsList(CragrBasePage):
                 if div.getchildren()[0].tag == 'a':
                     # This is at least present on CA Nord-Est
                     account.label = ' '.join(div.find('a').text.split()[:-1])
+                    account.link_id = div.find('a').get('href', '')
                     account.id = div.find('a').text.split()[-1]
                     s = div.find('div').find('b').find('span').text
                 else:
                     # This is at least present on CA Toulouse
                     account.label = div.find('a').text.strip()
+                    account.link_id = div.find('a').get('href', '')
                     account.id = div.findall('br')[1].tail.strip()
                     s = div.find('div').find('span').find('b').text
                 balance = u''
@@ -45,3 +49,115 @@ class AccountsList(CragrBasePage):
                 account.balance = float(balance)
                 l.append(account)
         return l
+
+    def is_account_page(self):
+        # tested on CA Lorraine, Paris, Toulouse
+        title_spans = self.document.xpath('/html/body/div[@class="dv"]/span')
+        for title_span in title_spans:
+            title_text = title_span.text_content().strip().replace("\n", '')
+            if (re.match('.*Compte.*n.[0-9]+.*au.*', title_text)):
+                return True
+        return False
+
+    def next_page_url(self):
+        # tested on CA Lorraine, Paris, Toulouse
+        a = self.document.xpath('/html/body//div[@class="navlink"]//a[contains(text(), "Suite")]')
+        if not a:
+            return False
+        else:
+            return a[0].get('href', '')
+
+    def is_right_aligned_div(self, div_elmt):
+        return(re.match('.*text-align: ?right.*', div_elmt.get('style', '')))
+
+    def extract_text(self, xml_elmt):
+        data = u''
+        for text in xml_elmt.itertext():
+            data = data + u'%s ' % text
+        data = re.sub(' +', ' ', data.replace("\n", ' ').strip())
+        return data
+
+    def get_history(self, start_index = 0):
+        # tested on CA Lorraine, Paris, Toulouse
+        # avoid parsing the page as an account-dedicated page if it is not one
+        if not self.is_account_page():
+            return
+
+        index = start_index
+        operation = False
+
+        body_elmt_list = self.document.xpath('/html/body/*')
+
+        # type of separator used in the page
+        separators = 'hr'
+        # How many <hr> elements do we have under the <body>?
+        sep_expected = len(self.document.xpath('/html/body/hr'))
+        if (not sep_expected):
+            # no <hr>? Then how many class-less <div> used as separators instead?
+            sep_expected = len(self.document.xpath('/html/body/div[not(@class) and not(@style)]'))
+            separators = 'div'
+
+        # the interesting divs are after the <hr> elements
+        interesting_divs = []
+        right_div_count = 0
+        left_div_count = 0
+        sep_found = 0
+        for body_elmt in body_elmt_list:
+            if (separators == 'hr' and body_elmt.tag == 'hr'):
+                sep_found += 1
+            elif (separators == 'div' and body_elmt.tag == 'div' and body_elmt.get('class', 'nope') == 'nope'):
+                sep_found += 1
+            elif (sep_found >= sep_expected and body_elmt.tag == 'div'):
+                # we just want <div> with dv class and a style attribute
+                if (body_elmt.get('class', '') != 'dv'):
+                    continue
+                if (body_elmt.get('style', 'nope') == 'nope'):
+                    continue
+                interesting_divs.append(body_elmt)
+                if (self.is_right_aligned_div(body_elmt)):
+                    right_div_count += 1
+                else:
+                    left_div_count += 1
+
+        # So, how are data laid out?
+        toulouse_way_of_life = (left_div_count == 2 * right_div_count)
+        # we'll have: one left-aligned div for the date, one right-aligned
+        # div for the amount, and one left-aligned div for the label. Each time.
+
+        if (not toulouse_way_of_life):
+            for body_elmt in interesting_divs:
+                if (self.is_right_aligned_div(body_elmt)):
+                    # this is the second line of an operation entry, displaying the amount
+                    data = self.extract_text(body_elmt).replace(',', '.').replace(' ', '')
+                    matches = re.findall('^(-?[0-9]+\.[0-9]{2}).*$', data)
+                    operation.amount = float(matches[0]) if (matches) else 0.0
+                    yield operation
+                else:
+                    # this is the first line of an operation entry, displaying the date and label
+                    data = self.extract_text(body_elmt)
+                    matches = re.findall('^([012][0-9]|3[01])/(0[1-9]|1[012]).(.+)$', data)
+                    operation = Operation(index)
+                    index += 1
+                    if (matches):
+                        operation.date  = u'%s/%s' % (matches[0][0], matches[0][1])
+                        operation.label = u'%s'    % matches[0][2]
+                    else:
+                        operation.date  = u'01/01'
+                        operation.label = u'Unknown'
+        else:
+            for i in range(0, len(interesting_divs)/3):
+                operation = Operation(index)
+                index += 1
+                # amount
+                data = self.extract_text(interesting_divs[(i*3)+1]).replace(',', '.').replace(' ', '')
+                matches = re.findall('^(-?[0-9]+\.[0-9]{2}).*$', data)
+                operation.amount = float(matches[0]) if (matches) else 0.0
+                # date
+                data = self.extract_text(interesting_divs[i*3])
+                matches = re.findall('^([012][0-9]|3[01])/(0[1-9]|1[012])', data)
+                operation.date = u'%s/%s' % (matches[0][0], matches[0][1]) if (matches) else u'01/01'
+                #label
+                data = self.extract_text(interesting_divs[(i*3)+2])
+                data = re.sub(' +', ' ', data)
+                operation.label = u'%s' % data
+                yield operation
-- 
1.7.1




More information about the weboob mailing list