[weboob] [PATCH 1/1] Add module for Europarl videos

François Revol revol at free.fr
Fri Aug 31 00:04:27 CEST 2012


Videos of committee meetings and other events are currently supported.
TODO: support plenary sessions
TODO: support the latest-videos listing and search

Signed-off-by: François Revol <revol at free.fr>
---
 modules/europarl/__init__.py          |    3 +
 modules/europarl/backend.py           |   82 ++++++++++++++++++++++
 modules/europarl/browser.py           |   57 ++++++++++++++++
 modules/europarl/favicon.png          |  Bin 0 -> 526 bytes
 modules/europarl/favicon_europarl.xcf |  Bin 0 -> 2340 bytes
 modules/europarl/pages.py             |  120 +++++++++++++++++++++++++++++++++
 modules/europarl/test.py              |   42 ++++++++++++
 modules/europarl/video.py             |   50 ++++++++++++++
 8 files changed, 354 insertions(+)
 create mode 100644 modules/europarl/__init__.py
 create mode 100644 modules/europarl/backend.py
 create mode 100644 modules/europarl/browser.py
 create mode 100644 modules/europarl/favicon.png
 create mode 100644 modules/europarl/favicon_europarl.xcf
 create mode 100644 modules/europarl/pages.py
 create mode 100644 modules/europarl/test.py
 create mode 100644 modules/europarl/video.py

diff --git a/modules/europarl/__init__.py b/modules/europarl/__init__.py
new file mode 100644
index 0000000..0994ffe
--- /dev/null
+++ b/modules/europarl/__init__.py
@@ -0,0 +1,3 @@
+from .backend import EuroparlBackend
+
+__all__ = ['EuroparlBackend']
diff --git a/modules/europarl/backend.py b/modules/europarl/backend.py
new file mode 100644
index 0000000..971c7d6
--- /dev/null
+++ b/modules/europarl/backend.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+# Copyright(C) 2012 François Revol
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from __future__ import with_statement
+
+from weboob.capabilities.video import ICapVideo, BaseVideo
+from weboob.tools.backend import BaseBackend
+from weboob.capabilities.collection import ICapCollection, CollectionNotFound
+
+from .browser import EuroparlBrowser
+from .video import EuroparlVideo
+
+
+__all__ = ['EuroparlBackend']
+
+
+class EuroparlBackend(BaseBackend, ICapVideo, ICapCollection):
+    NAME = 'europarl'
+    MAINTAINER = u'François Revol'
+    EMAIL = 'revol at free.fr'
+    VERSION = '0.d'
+    DESCRIPTION = 'Europarl parliamentary video streaming website'
+    LICENSE = 'AGPLv3+'
+    BROWSER = EuroparlBrowser
+
+    def get_video(self, _id):
+        with self.browser:
+            return self.browser.get_video(_id)
+
+    SORTBY = ['relevance', 'rating', 'views', 'time']
+
+    # def search_videos(self, pattern, sortby=ICapVideo.SEARCH_RELEVANCE, nsfw=False, max_results=None):
+    #     with self.browser:
+    #         return self.browser.search_videos(pattern, self.SORTBY[sortby])
+
+    def fill_video(self, video, fields):
+        if fields != ['thumbnail']:
+            # if we don't want only the thumbnail, we probably want also every fields
+            with self.browser:
+                video = self.browser.get_video(EuroparlVideo.id2url(video.id), video)
+        if 'thumbnail' in fields and video.thumbnail:
+            with self.browser:
+                video.thumbnail.data = self.browser.readurl(video.thumbnail.url)
+
+        return video
+
+    def iter_resources(self, objs, split_path):
+        if BaseVideo in objs:
+            collection = self.get_collection(objs, split_path)
+            if collection.path_level == 0:
+                yield self.get_collection(objs, [u'latest'])
+            if collection.split_path == [u'latest']:
+                for video in self.browser.latest_videos():
+                    yield video
+
+    def validate_collection(self, objs, collection):
+        if collection.path_level == 0:
+            return
+        if BaseVideo in objs and collection.split_path == [u'latest']:
+            collection.title = u'Latest Europarl videos'
+            return
+        raise CollectionNotFound(collection.split_path)
+
+    OBJECTS = {EuroparlVideo: fill_video}
diff --git a/modules/europarl/browser.py b/modules/europarl/browser.py
new file mode 100644
index 0000000..734bd21
--- /dev/null
+++ b/modules/europarl/browser.py
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+# Copyright(C) 2012 François Revol
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.browser import BaseBrowser
+from weboob.tools.browser.decorators import id2url
+
+#from .pages.index import IndexPage
+from .pages import VideoPage
+from .video import EuroparlVideo
+
+
+__all__ = ['EuroparlBrowser']
+
+
+class EuroparlBrowser(BaseBrowser):
+    DOMAIN = 'europarl.europa.eu'
+    ENCODING = None
+    PAGES = {r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/committees/video\?.*event=(?P<id>[^&]+).*': VideoPage,
+             r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/other-events/video\?.*event=(?P<id>[^&]+).*': VideoPage
+#TODO:plenaries
+#            r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/plenary/video\?.*date=(?P<id>[^&]+).*': VideoPage
+#            r'http://[w\.]*europarl\.europa\.eu/ep-live/(?P<lang>\w+)/plenary/video\?.*debate=(?P<id>[^&]+).*': VideoPage
+            }
+
+    @id2url(EuroparlVideo.id2url)
+    def get_video(self, url, video=None):
+        self.location(url)
+        return self.page.get_video(video)
+
+    # def search_videos(self, pattern, sortby):
+    #     return None
+    #     self.location(self.buildurl('http://europarltv.europa.eu/en/search%s' % sortby, query=pattern.encode('utf-8')))
+    #     assert self.is_on_page(IndexPage)
+    #     return self.page.iter_videos()
+
+    # def latest_videos(self):
+    #     self.home()
+    #     assert self.is_on_page(IndexPage)
+    #     return self.page.iter_videos()
diff --git a/modules/europarl/favicon.png b/modules/europarl/favicon.png
new file mode 100644
index 0000000000000000000000000000000000000000..fffc442d867908208495c1b977b2e0ce2c9b439a
GIT binary patch
literal 526
zcmeAS at N?(olHy`uVBq!ia0vp^4j|0I1|(Ny7TyC=Y)RhkE)4%caKYZ?lYt_f1s;*b
z3=G`DAk4 at xYmNj^kiEpy*OmPahny%kmqC!vb)b-BiEBiObAE1aYF-J0b5UwyNotBh
zd1gt5g1e`0KzJjcI0FM?nWu|mNX4zUvo7)-R^V{G-BzEK^y_DD{ni|R?{63UHU?<K
z$WK4}a%v3Ui#_Ysf1gsJ6a0yhQGtP_fkEIv)|I^qOouoQ*q(j=y}Ew6sO}TRH}&?~
z(R)j6H?wdRn-{1Y+J8Tj%VnPb=e~@cPZ-m=HKIi$=Y0F4CReb<@^s0Li2QqpezYks
zc=xnm<E`Z at E$y|r=WtG7YT5DNlgOH76FYSe9%TV?k}7}QDZMz^-(Sf}K9<+3L8$l7
z%y8w9(@gUYJpK4|gZY|yg~lozq6<9 at dKvCb443NH<o;APb3yr&s|S+1$`;-6;7f0c
zv00{b;`UTlCFOi6{s(SG0TL<_HmpiaDy$#oh`DD=bV+=hBel$$yF#yL-4%nE`fju2
z3@%k&`_N^O-?HYUzb$W8$&{$#xRr at V^xWL`EGoTQ*Z#lb8k6$+{ZX?oybGE+al!Ow
z*}vK+J!0CjXHm{n*=u?CI4Tv)ryLG+IN|+SRM*hN1rlluOiyIxPb7VgD*F(B7Z^<p
Mp00i_>zopr0Q9ia9{>OV

literal 0
HcmV?d00001

diff --git a/modules/europarl/favicon_europarl.xcf b/modules/europarl/favicon_europarl.xcf
new file mode 100644
index 0000000000000000000000000000000000000000..ea7301492c80a8b11c17e9e7ff19711342973303
GIT binary patch
literal 2340
zcma);O>P at U5QWL%@XxUmQIxWhfT1h_G9=P8AbTRi28d)4WMLy$ps67}0DlybPVAMg
z;DwhtKym}`bBXM7kj^;yUa^Ov7zIcR)%E&yRZUkvjYgw%av8lH45MK>P6~{kH{k|W
zEHX8*+Sl*HpJm`3EW}FKmmVu_m-q?(-iWg8!E7=~rVG{uZEk;amGl?MMf6j;cpZKF
z-HY$Fvt^;_q(4g9qpS2n9MQ$X at qF=aoJ8~4IK6mq=Xf|8%*L~;=-6EQYV at ifbvs=f
zw}3}}>j9~+Zb{AeqyGj<eLKJG57Ox<dXzs4^4)Eu=h|<k=^{#}gV)Ji`Lp40o-9 at e
zRQA8AC37=X_fQ%2CzHOvk-|>5bM}~@^Yinsx at TwSPvR$!wdY6i_a&&=g at b$EboY0M
z{U6i8Y}&b;j#w4~O^gaAA^*Z^9xJ}?F|M$;hVQK5&(`qWHGFRke*sqAd{hB?o<{yl
zl%EAE_+UH#oSoR<mz!MhQS0mCAMvLwey*vjsU8PeP2fmVHLhl5fkUIitb#u<GR#7J
z6o*+AD`N+S2`(xl^vpJXUqqnEwi+w6C-jV(hXPFswgh%8H^}58$OgWJHCKsx?o+e8
z=u7QcAFB6Is~V}N0(BzWu<|F675ph?joOh;Slz65UHG`FxGSV9FrfY$Ebf!j0P8I3
zvZAPMP3nD<f%eVowgRhQ?He0VCsb+~pu$#6=RCp`pW_k%CFApoTA4^zsl#fclVCqK
zot^H-NoOaW9ql{m?4+}!gj18yp+jd!pOB$LXGf0?oo$WIV1jv`Q?EpZj(#0FJL&AV
z(Agszb<)|Pvy;xQC4j!<22LKG9XhiSI&}7ldM at 2yIfKr=57k3wC!HNStEteTvy(?>
zx(L|_myOzK(Rrhu>Fg^a at A3)^sDE^J$bnzpfx1K*C!LqQZ!*yKB=v%BR>9gg7M;0I
zV$j(NTQQvzoh?4c=<HAyKCh^iiDZ?~d9#zH&e^9pd#<Ugsh$OKP2fmVm6x?FaA<TG
zSMUc$>UXvpNA^~S1cmT=!wD{G()%uE8`~EVthO2}v?uh8nuh{S3bq7xEH{YdBgh86
zg*8`+dhSy*!tHr4wP$^(-b1Zwq at D`YiEP8lpFmddr<gTr*(a=S)<fr2#a$s?fdTd3
zU~!+E23TiNmlZ{AYf|r<475E-y_CsT1#92ffI6X4%K#O&VmjxMkI!+5fRgcfMXgLE
ztJGn&(MhnMU2oa9rpKRf+3t(}{D(J5bis%Iw_nG}V3E$I at T-qR{(-uN`#&G3&wZ{w
hVq2L0Lt*6K8qt2#{si>A!Pgb8(OX(Q!WVSjzX5~j0R{j7

literal 0
HcmV?d00001

diff --git a/modules/europarl/pages.py b/modules/europarl/pages.py
new file mode 100644
index 0000000..19eaf33
--- /dev/null
+++ b/modules/europarl/pages.py
@@ -0,0 +1,120 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+# Copyright(C) 2012 François Revol
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+from weboob.tools.mech import ClientForm
+ControlNotFoundError = ClientForm.ControlNotFoundError
+
+from weboob.tools.browser import BasePage
+
+import re
+import datetime
+
+from weboob.capabilities.base import NotAvailable
+from weboob.tools.browser import BrokenPageError
+
+from .video import EuroparlVideo
+
+
+
+__all__ = ['VideoPage']
+
+class VideoPage(BasePage):
+    def get_video(self, video=None):
+        if video is None:
+            video = EuroparlVideo(self.group_dict['id'])
+        video.title = unicode(self.get_title())
+        video.url = unicode(self.get_url())
+        self.set_details(video)
+
+        video.set_empty_fields(NotAvailable)
+        return video
+
+    def get_url(self):
+        # search for <input id="codeUrl">
+        # TODO: plenaries can be downloaded as mp4...
+        obj = self.parser.select(self.document.getroot(), 'input#codeUrl', 1)
+        if obj is None:
+            return None
+        return obj.attrib['value']
+
+    def get_title(self):
+        obj = self.parser.select(self.document.getroot(), 'h1#player_subjectTitle')
+        if len(obj) < 1:
+            obj = self.parser.select(self.document.getroot(), 'title')
+            if len(obj) < 1:
+                return None
+        title = obj[0].text.strip()
+        obj = self.parser.select(self.document.getroot(), 'span.ep_subtitle')
+        if len(obj) < 1:
+            return title
+
+        for span in self.parser.select(obj[0], 'span.ep_acronym, span.ep_theme'):
+            if span.text_content():
+                title += ' ' + span.text_content().strip()
+        
+        return title
+
+    def set_details(self, v):
+        v.author = u'European Parliament'
+        obj = self.parser.select(self.document.getroot(), 'meta[name=available]', 1)
+        if obj is not None:
+            value = obj.attrib['content']
+            print value
+            m = re.match('(\d\d)-(\d\d)-(\d\d\d\d)\s*(\d\d):(\d\d)', value)
+            if not m:
+                raise BrokenPageError('Unable to parse datetime: %r' % value)
+            day = m.group(1)
+            month = m.group(2)
+            year = m.group(3)
+            hour = m.group(4)
+            minute = m.group(5)
+            v.date = datetime.datetime(year=int(year),
+                                       month=int(month),
+                                       day=int(day),
+                                       hour=int(hour),
+                                       minute=int(minute))
+            
+        obj = self.parser.select(self.document.getroot(), 'span.ep_subtitle', 1)
+        if obj is not None:
+            span = self.parser.select(obj, 'span.ep_date', 1)
+            value = span.text
+            m = re.match('(\d\d):(\d\d)\s*\/\s*(\d\d):(\d\d)\s*-\s*(\d\d)-(\d\d)-(\d\d\d\d)', value)
+            if not m:
+                raise BrokenPageError('Unable to parse datetime: %r' % value)
+            bhour = m.group(1)
+            bminute = m.group(2)
+            ehour = m.group(3)
+            eminute = m.group(4)
+            day = m.group(5)
+            month = m.group(6)
+            year = m.group(7)
+            
+            start = datetime.datetime(year=int(year),
+                                      month=int(month),
+                                      day=int(day),
+                                      hour=int(bhour),
+                                      minute=int(bminute))
+            end = datetime.datetime(year=int(year),
+                                    month=int(month),
+                                    day=int(day),
+                                    hour=int(ehour),
+                                    minute=int(eminute))
+
+            v.duration = end - start
diff --git a/modules/europarl/test.py b/modules/europarl/test.py
new file mode 100644
index 0000000..aa0c6de
--- /dev/null
+++ b/modules/europarl/test.py
@@ -0,0 +1,42 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Romain Bignon
+# Copyright(C) 2012 François Revol
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.tools.test import BackendTest
+#from weboob.capabilities.video import BaseVideo
+
+
+class EuroparlTest(BackendTest):
+    """Test case for the 'europarl' backend; all its tests are still disabled."""
+    BACKEND = 'europarl'
+    # def test_search(self):
+    #     l = list(self.backend.search_videos('neelie kroes'))
+    #     self.assertTrue(len(l) > 0)
+    #     v = l[0]
+    #     self.backend.fillobj(v, ('url',))
+    #     self.assertTrue(v.url and v.url.startswith('http://'), 'URL for video "%s" not found: %s' % (v.id, v.url))
+    #     self.backend.browser.openurl(v.url)
+
+    # def test_latest(self):
+    #     l = list(self.backend.iter_resources([BaseVideo], [u'latest']))
+    #     self.assertTrue(len(l) > 0)
+    #     v = l[0]
+    #     self.backend.fillobj(v, ('url',))
+    #     self.assertTrue(v.url and v.url.startswith('http://'), 'URL for video "%s" not found: %s' % (v.id, v.url))
diff --git a/modules/europarl/video.py b/modules/europarl/video.py
new file mode 100644
index 0000000..c1cc390
--- /dev/null
+++ b/modules/europarl/video.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+
+# Copyright(C) 2010-2011 Roger Philibert
+#
+# This file is part of weboob.
+#
+# weboob is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# weboob is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with weboob. If not, see <http://www.gnu.org/licenses/>.
+
+
+from weboob.capabilities.video import BaseVideo
+
+import re
+
+__all__ = ['EuroparlVideo']
+
+
+class EuroparlVideo(BaseVideo):
+    def __init__(self, *args, **kwargs):
+        BaseVideo.__init__(self, *args, **kwargs)
+        self.ext = u'wmv'
+
+    @classmethod
+    def id2url(cls, _id):
+        m = re.match('.*-COMMITTEE-.*', _id)
+        if m:
+            return u'http://www.europarl.europa.eu/ep-live/en/committees/video?event=%s&format=wmv' % _id
+        m = re.match('.*-SPECIAL-.*', _id)
+        if m:
+            return u'http://www.europarl.europa.eu/ep-live/en/other-events/video?event=%s&format=wmv' % _id
+        # XXX: not yet supported
+        m = re.match('\d\d-\d\d-\d\d\d\d', _id)
+        if m:
+            return u'http://www.europarl.europa.eu/ep-live/en/plenary/video?date=%s' % _id
+        # XXX: not yet supported
+        m = re.match('\d+', _id)
+        if m:
+            return u'http://www.europarl.europa.eu/ep-live/en/plenary/video?debate=%s' % _id
+        return None
+
-- 
1.7.10.4




More information about the weboob mailing list