[weboob] [PATCH v3 3/3] [piratebay] Ported to browser2

Matthieu Weber mweber+weboob at free.fr
Sat Mar 19 15:55:13 CET 2016


Signed-off-by: Matthieu Weber <mweber+weboob at free.fr>
---
 modules/piratebay/browser.py        |  48 ++++---------
 modules/piratebay/module.py         |   2 +-
 modules/piratebay/pages/index.py    |   4 +-
 modules/piratebay/pages/torrents.py | 132 ++++++++++++------------------------
 4 files changed, 61 insertions(+), 125 deletions(-)

diff --git a/modules/piratebay/browser.py b/modules/piratebay/browser.py
index 3f86015..f678e11 100644
--- a/modules/piratebay/browser.py
+++ b/modules/piratebay/browser.py
@@ -18,48 +18,30 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
 
-import random
-import urllib
-from urlparse import urlsplit
-
-from weboob.deprecated.browser import Browser, BrowserHTTPNotFound
+from weboob.browser import PagesBrowser, URL
 
 from .pages.index import IndexPage
-from .pages.torrents import TorrentPage, TorrentsPage
+from .pages.torrents import TorrentPage, TorrentsPage, FilesPage
 
 __all__ = ['PiratebayBrowser']
 
 
-class PiratebayBrowser(Browser):
-    ENCODING = 'utf-8'
-    DOMAINS = ['thepiratebay.se']
+class PiratebayBrowser(PagesBrowser):
+    BASEURL = 'https://thepiratebay.se'
 
-    def __init__(self, url, *args, **kwargs):
-        url = url or 'https://%s/' % random.choice(self.DOMAINS)
-        url_parsed = urlsplit(url)
-        self.PROTOCOL = url_parsed.scheme
-        self.DOMAIN = url_parsed.netloc
-        self.PAGES = {
-            '%s://%s/' % (self.PROTOCOL, self.DOMAIN): IndexPage,
-            '%s://%s/search/.*/0/7/0' % (self.PROTOCOL, self.DOMAIN): TorrentsPage,
-            '%s://%s/torrent/.*' % (self.PROTOCOL, self.DOMAIN): TorrentPage
-        }
-        Browser.__init__(self, *args, **kwargs)
+    index_page = URL('/$', IndexPage)
+    torrents_page = URL('/search/(?P<query>.+)/0/7/0', TorrentsPage)
+    torrent_page = URL('/torrent/(?P<id>.+)', TorrentPage)
+    files_page = URL('/ajax_details_filelist.php\?id=(?P<id>.+)', FilesPage)
 
     def iter_torrents(self, pattern):
-        self.location('%s://%s/search/%s/0/7/0' % (self.PROTOCOL,
-                                                   self.DOMAIN,
-                                                   urllib.quote_plus(pattern.encode('utf-8'))))
-
-        assert self.is_on_page(TorrentsPage)
+        self.torrents_page.go(query=pattern)
         return self.page.iter_torrents()
 
     def get_torrent(self, _id):
-        try:
-            self.location('%s://%s/torrent/%s/' % (self.PROTOCOL,
-                                                   self.DOMAIN,
-                                                   _id))
-        except BrowserHTTPNotFound:
-            return
-        if self.is_on_page(TorrentPage):
-            return self.page.get_torrent(_id)
+        self.torrent_page.go(id=_id)
+        torrent = self.page.get_torrent()
+        self.files_page.go(id=_id)
+        files = self.page.get_files()
+        torrent.files = files
+        return torrent
diff --git a/modules/piratebay/module.py b/modules/piratebay/module.py
index 6407f07..377e0c0 100644
--- a/modules/piratebay/module.py
+++ b/modules/piratebay/module.py
@@ -51,7 +51,7 @@ class PiratebayModule(Module, CapTorrent):
 
         if torrent.url is NotAvailable and torrent.magnet:
             raise MagnetOnly(torrent.magnet)
-        return self.browser.openurl(torrent.url.encode('utf-8')).read()
+        return self.browser.open(torrent.url).content
 
     def iter_torrents(self, pattern):
         return self.browser.iter_torrents(pattern.replace(' ', '+'))
diff --git a/modules/piratebay/pages/index.py b/modules/piratebay/pages/index.py
index 688b7b5..83d6c24 100644
--- a/modules/piratebay/pages/index.py
+++ b/modules/piratebay/pages/index.py
@@ -18,9 +18,9 @@
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
 
-from weboob.deprecated.browser import Page
+from weboob.browser.pages import HTMLPage
 
 
-class IndexPage(Page):
+class IndexPage(HTMLPage):
     def is_logged(self):
         return 'id' in self.document.find('body').attrib
diff --git a/modules/piratebay/pages/torrents.py b/modules/piratebay/pages/torrents.py
index df7b7c4..06170c9 100644
--- a/modules/piratebay/pages/torrents.py
+++ b/modules/piratebay/pages/torrents.py
@@ -17,104 +17,58 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with weboob. If not, see <http://www.gnu.org/licenses/>.
 
-
-from weboob.deprecated.browser import Page,BrokenPageError
+from weboob.tools.misc import get_bytes_size
+from weboob.browser.pages import HTMLPage
+from weboob.browser.elements import ItemElement, ListElement, method
 from weboob.capabilities.torrent import Torrent
-from weboob.capabilities.base import NotAvailable, NotLoaded
+from weboob.capabilities.base import NotAvailable
+from weboob.browser.filters.standard import RawText, CleanText, Regexp, Date, Type
+
 
-from html2text import unescape
+class TorrentsPage(HTMLPage):
+    @method
+    class iter_torrents(ListElement):
+        item_xpath = '//table[@id="searchResult"]/tr'
 
+        class item(ItemElement):
+            klass = Torrent
 
-class TorrentsPage(Page):
-    def unit(self, n, u):
-        m = {'B': 1,
-                'KB': 1024,
-                'MB': 1024 * 1024,
-                'GB': 1024 * 1024 * 1024,
-                'TB': 1024 * 1024 * 1024 * 1024,
-                }
-        return float(n * m[u])
+            obj_id = Regexp(CleanText('./td[2]/div/a[@class="detLink"]/@href'),
+                            r'^/torrent/(\d+)/', '\\1')
+            obj_name = Regexp(CleanText('./td[2]/div/a[@class="detLink"]/@title'),
+                              r'Details for (.*)$', '\\1')
+            obj_magnet = CleanText('./td[2]/a[@title="Download this torrent using magnet"]/@href')  # @title = attribute test
+            obj_date = Date(Regexp(CleanText('./td[2]/font'), r'Uploaded (\d{2}-\d{2} \d{4})', '\\1'))
+            obj_seeders = Type(CleanText('./td[3]'), type=int)
+            obj_leechers = Type(CleanText('./td[4]'), type=int)
 
-    def iter_torrents(self):
-        try:
-            table = self.parser.select(self.document.getroot(), 'table#searchResult', 1)
-        except BrokenPageError:
-            return
-        first = True
-        for tr in table.getiterator('tr'):
-            if first:
-                first = False
-                continue
-            if tr.get('class', '') != "header":
-                td = tr.getchildren()[1]
-                div = td.getchildren()[0]
-                link = div.find('a').attrib['href']
-                title = unicode(unescape(div.find('a').text))
-                idt = link.split('/')[2]
+            def obj_size(self):
+                value, unit = Regexp(CleanText('./td[2]/font'), r'Size ([\d\.]+ [^,]+),', '\\1')(self).split(' ')
+                return get_bytes_size(float(value), unit)
 
-                a = td.getchildren()[1]
-                url = unicode(a.attrib['href'])
 
-                size = td.find('font').text.split(',')[1].strip()
-                u = size.split(' ')[1].split(u'\xa0')[1].replace('i', '')
-                size = size.split(' ')[1].split(u'\xa0')[0]
+class TorrentPage(HTMLPage):
+    @method
+    class get_torrent(ItemElement):
+        klass = Torrent
 
-                seed = tr.getchildren()[2].text
-                leech = tr.getchildren()[3].text
+        def obj_id(self):
+            return self.page.url.split('/')[-1]
 
-                torrent = Torrent(idt, title)
-                torrent.url = url
-                torrent.size = self.unit(float(size), u)
-                torrent.seeders = int(seed)
-                torrent.leechers = int(leech)
-                torrent.description = NotLoaded
-                torrent.files = NotLoaded
-                torrent.magnet = NotLoaded
-                yield torrent
+        def obj_url(self):
+            return NotAvailable
 
+        obj_name = CleanText('//div[@id="title"]')
+        obj_magnet = CleanText('//div[@class="download"]/a[starts-with(@href, "magnet:")]/@href')
+        obj_date = Date(CleanText('//div[@id="details"]//dt[.="Uploaded:"]/following-sibling::dd[1]'))
+        obj_size = Type(Regexp(CleanText('//div[@id="details"]//dt[.="Size:"]/following-sibling::dd[1]'),
+                        r'\((\d+) Bytes\)', '\\1'), type=float)
+        obj_seeders = Type(CleanText('//div[@id="details"]//dt[.="Seeders:"]/following-sibling::dd[1]'), type=int)
+        obj_leechers = Type(CleanText('//div[@id="details"]//dt[.="Leechers:"]/following-sibling::dd[1]'), type=int)
+        obj_description = RawText('//div[@class="nfo"]/pre', children=True)
 
-class TorrentPage(Page):
-    def get_torrent(self, id):
-        url = NotAvailable
-        magnet = NotAvailable
-        for div in self.document.getiterator('div'):
-            if div.attrib.get('id', '') == 'title':
-                title = unicode(unescape(div.text.strip()))
-            elif div.attrib.get('class', '') == 'download':
-                for link in self.parser.select(div, 'a'):
-                    href = link.attrib.get('href', '')
-                    # https fails on the download server, so strip it
-                    if href.startswith('https://'):
-                        href = href.replace('https://', 'http://', 1)
-                    if href.startswith('magnet:'):
-                        magnet = unicode(href)
-                    elif len(href):
-                        url = unicode(href)
-            elif div.attrib.get('id', '') == 'details':
-                size = float(div.getchildren()[0].getchildren()[5].text.split('(')[1].split('Bytes')[0])
-                if len(div.getchildren()) > 1 \
-                and div.getchildren()[1].attrib.get('class', '') == 'col2':
-                    child_to_explore = div.getchildren()[1]
-                else:
-                    child_to_explore = div.getchildren()[0]
-                prev_child_txt = "none"
-                seed = "-1"
-                leech = "-1"
-                for ch in child_to_explore.getchildren():
-                    if prev_child_txt == "Seeders:":
-                        seed = ch.text
-                    if prev_child_txt == "Leechers:":
-                        leech = ch.text
-                    prev_child_txt = ch.text
-            elif div.attrib.get('class', '') == 'nfo':
-                description = unicode(div.getchildren()[0].text_content().strip())
-        torrent = Torrent(id, title)
-        torrent.url = url or NotAvailable
-        torrent.magnet = magnet
-        torrent.size = size
-        torrent.seeders = int(seed)
-        torrent.leechers = int(leech)
-        torrent.description = description
-        torrent.files = NotAvailable
 
-        return torrent
+class FilesPage(HTMLPage):
+    def get_files(self):
+        return [" ".join([td.text for td in tr.xpath('./td')])
+                for tr in self.doc.xpath('//table/tr')]
-- 
2.1.4




More information about the weboob mailing list