[weboob] [PATCH v3 2/3] RawText optionally retrieves text from sub-elements

Matthieu Weber mweber+weboob at free.fr
Sat Mar 19 15:55:12 CET 2016


Signed-off-by: Matthieu Weber <mweber+weboob at free.fr>
---
 weboob/browser/filters/standard.py | 17 ++++++++++---
 weboob/browser/tests/filters.py    | 50 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)
 create mode 100644 weboob/browser/tests/filters.py

diff --git a/weboob/browser/filters/standard.py b/weboob/browser/filters/standard.py
index 3486bd6..3450e9f 100644
--- a/weboob/browser/filters/standard.py
+++ b/weboob/browser/filters/standard.py
@@ -309,15 +309,26 @@ class TableCell(_Filter):
 
 
 class RawText(Filter):
+    def __init__(self, selector=None, children=False, default=_NO_DEFAULT):
+        super(RawText, self).__init__(selector, default=default)
+        self.children = children
+
     @debug()
     def filter(self, el):
         if isinstance(el, (tuple, list)):
             return u' '.join([self.filter(e) for e in el])
 
-        if el.text is None:
-            return self.default
+        if self.children:
+            text = el.text_content()
         else:
-            return unicode(el.text)
+            text = el.text
+        # NOTE(review): text_content() presumably never returns None, so default may only apply when children is False - confirm
+        if text is None:
+            result = self.default
+        else:
+            result = unicode(text)
+
+        return result
 
 
 class CleanText(Filter):
diff --git a/weboob/browser/tests/filters.py b/weboob/browser/tests/filters.py
new file mode 100644
index 0000000..483b130
--- /dev/null
+++ b/weboob/browser/tests/filters.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Copyright(C) 2016 Matthieu Weber
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+from unittest import TestCase
+from lxml.html import fromstring
+
+from weboob.browser.filters.standard import RawText
+
+
+class RawTextTest(TestCase):
+    # Original RawText behaviour (children not set): only the leading text of <p> is read
+    # - <p> is empty, so there is no leading text and we get the default value
+    def test_empty_element(self):
+        e = fromstring('<html><body><p></p></body></html>')
+        self.assertEqual("foo", RawText('//p', default="foo")(e))
+
+    # - <p> starts with text, we retrieve only that leading text (not the <span> or its tail)
+    def test_first_node_is_text(self):
+        e = fromstring('<html><body><p>blah: <span>229,90</span> EUR</p></body></html>')
+        self.assertEqual("blah: ", RawText('//p', default="foo")(e))
+
+    # - <p> starts with a sub-element, so there is no leading text and we get the default value
+    def test_first_node_is_element(self):
+        e = fromstring('<html><body><p><span>229,90</span> EUR</p></body></html>')
+        self.assertEqual("foo", RawText('//p', default="foo")(e))
+
+    # Recursive RawText behaviour (children=True): text of sub-elements is included
+    # - <p> starts with text, we retrieve all the text, including the text from sub-elements
+    def test_first_node_is_text_recursive(self):
+        e = fromstring('<html><body><p>blah: <span>229,90</span> EUR</p></body></html>')
+        self.assertEqual("blah: 229,90 EUR", RawText('//p', default="foo", children=True)(e))
+
+    # - <p> starts with a sub-element, we still retrieve all the text, including the text from sub-elements
+    def test_first_node_is_element_recursive(self):
+        e = fromstring('<html><body><p><span>229,90</span> EUR</p></body></html>')
+        self.assertEqual("229,90 EUR", RawText('//p', default="foo", children=True)(e))
-- 
2.1.4




More information about the weboob mailing list