Bienvenido! Aquí usted puede subir sus pastes (códigos) para posteriormente compartirlos, inclusive puede protegerlos con password. Siéntase libre de explorar :)

Subido por basepy el martes 7 de julio, 2:07 AM - Nunca expira
Descargar | Nuevo Paste

  1. #!/usr/bin/env python2
  2. # vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
  3. from __future__ import (unicode_literals, division, absolute_import,
  4.                         print_function)
  5.  
  6. __license__   = 'GPL v3'
  7. __copyright__ = '2011, Kovid Goyal <[email protected]>'
  8. __docformat__ = 'restructuredtext en'
  9.  
  10. import re, threading
  11. from future_builtins import map
  12.  
  13. from calibre import browser, random_user_agent
  14. from calibre.customize import Plugin
  15. from calibre.utils.icu import capitalize, lower, upper
  16. from calibre.ebooks.metadata import check_isbn
  17. from calibre.utils.localization import canonicalize_lang, get_lang
  18.  
  19. def create_log(ostream=None):
  20.     from calibre.utils.logging import ThreadSafeLog, FileStream
  21.     log = ThreadSafeLog(level=ThreadSafeLog.DEBUG)
  22.     log.outputs = [FileStream(ostream)]
  23.     return log
  24.  
  25. # Comparing Metadata objects for relevance {{{
  26. words = ("the", "a", "an", "of", "and")
  27. prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
  28. trailing_paren_pat = re.compile(r'\(.*\)$')
  29. whitespace_pat = re.compile(r'\s+')
  30.  
  31. def cleanup_title(s):
  32.     if not s:
  33.         s = _('Unknown')
  34.     s = s.strip().lower()
  35.     s = prefix_pat.sub(' ', s)
  36.     s = trailing_paren_pat.sub('', s)
  37.     s = whitespace_pat.sub(' ', s)
  38.     return s.strip()
  39.  
class InternalMetadataCompareKeyGen(object):

    '''
    Generate a sort key for comparison of the relevance of Metadata objects,
    given a search query. This is used only to compare results from the same
    metadata source, not across different sources.

    The sort key ensures that an ascending order sort is a sort by order of
    decreasing relevance.

    The algorithm is:

        * Prefer results that have at least one identifier the same as for the query
        * Prefer results with a cached cover URL
        * Prefer results with all available fields filled in
        * Prefer results with the same language as the current user interface language
        * Prefer results that are an exact title match to the query
        * Prefer results with longer comments (greater than 10% longer)
        * Use the relevance of the result as reported by the metadata source's search
           engine
    '''
    def __init__(self, mi, source_plugin, title, authors, identifiers):
        # Each criterion below is encoded as 1 (preferred) or 2 (not
        # preferred) so that an ascending sort puts better matches first.
        same_identifier = 2
        idents = mi.get_identifiers()
        # Python 2 dict iteration; a single matching identifier is enough.
        for k, v in identifiers.iteritems():
            if idents.get(k) == v:
                same_identifier = 1
                break
        # test_fields() returns the first null touched field, or None if all
        # fields are filled in.
        all_fields = 1 if source_plugin.test_fields(mi) is None else 2
        exact_title = 1 if title and \
                cleanup_title(title) == cleanup_title(mi.title) else 2
        language = 1
        if mi.language:
            mil = canonicalize_lang(mi.language)
            # 'und' (undetermined) is treated as matching any UI language.
            if mil != 'und' and mil != canonicalize_lang(get_lang()):
                language = 2
        # A cover counts only if the plugin's cached URLs are trustworthy AND
        # a cached URL actually exists for this book's identifiers.
        has_cover = 2 if (not source_plugin.cached_cover_url_is_reliable or
                source_plugin.get_cached_cover_url(mi.identifiers) is None) else 1
        # Primary key: lexicographic tuple comparison implements the priority
        # order documented in the class docstring.
        self.base = (same_identifier, has_cover, all_fields, language, exact_title)
        self.comments_len = len(mi.comments.strip() if mi.comments else '')
        # Tie-breaker of last resort: the source's own reported relevance.
        self.extra = (getattr(mi, 'source_relevance', 0), )
    def __cmp__(self, other):
        # Python 2-only ordering protocol: negative => self sorts first.
        result = cmp(self.base, other.base)
        if result == 0:
            # Now prefer results with the longer comments, within 10%
            cx, cy = self.comments_len, other.comments_len
            # True division (from __future__), so t is 10% of the mean length.
            t = (cx + cy) / 20
            delta = cy - cx
            if abs(delta) > t:
                # Positive when other's comments are longer, pushing self
                # later in an ascending sort (i.e. longer comments win).
                result = delta
            else:
                result = cmp(self.extra, other.extra)
        return result
# }}}
  91. def get_cached_cover_urls(mi):
  92.     from calibre.customize.ui import metadata_plugins
  93.     plugins = list(metadata_plugins(['identify']))
  94.     for p in plugins:
  95.         url = p.get_cached_cover_url(mi.identifiers)
  96.         if url:
  97.             yield (p, url)
  98. def dump_caches():
  99.     from calibre.customize.ui import metadata_plugins
  100.     return {p.name:p.dump_caches() for p in metadata_plugins(['identify'])}
  101. def load_caches(dump):
  102.     from calibre.customize.ui import metadata_plugins
  103.     plugins = list(metadata_plugins(['identify']))
  104.     for p in plugins:
  105.         cache = dump.get(p.name, None)
  106.         if cache:
  107.             p.load_caches(cache)
  108. def cap_author_token(token):
  109.     lt = lower(token)
  110.     if lt in ('von', 'de', 'el', 'van', 'le'):
  111.         return lt
  112.     # no digits no spez. characters
  113.     if re.match(r'([^\d\W]\.){2,}$', lt, re.UNICODE) is not None:
  114.         # Normalize tokens of the form J.K. to J. K.
  115.         parts = token.split('.')
  116.         return '. '.join(map(capitalize, parts)).strip()
  117.     scots_name = None
  118.     for x in ('mc', 'mac'):
  119.         if (token.lower().startswith(x) and len(token) > len(x) and
  120.                 (
  121.                     token[len(x)] == upper(token[len(x)]) or
  122.                     lt == token
  123.                 )):
  124.             scots_name = len(x)
  125.             break
  126.     ans = capitalize(token)
  127.     if scots_name is not None:
  128.         ans = ans[:scots_name] + upper(ans[scots_name]) + ans[scots_name+1:]
  129.     for x in ('-', "'"):
  130.         idx = ans.find(x)
  131.         if idx > -1 and len(ans) > idx+2:
  132.             ans = ans[:idx+1] + upper(ans[idx+1]) + ans[idx+2:]
  133.     return ans
  134. def fixauthors(authors):
  135.     if not authors:
  136.         return authors
  137.     ans = []
  138.     for x in authors:
  139.         ans.append(' '.join(map(cap_author_token, x.split())))
  140.     return ans
  141. def fixcase(x):
  142.     if x:
  143.         from calibre.utils.titlecase import titlecase
  144.         x = titlecase(x)
  145.     return x
  146. class Option(object):
  147.     __slots__ = ['type', 'default', 'label', 'desc', 'name', 'choices']
  148.     def __init__(self, name, type_, default, label, desc, choices=None):
  149.         '''
  150.         :param name: The name of this option. Must be a valid python identifier
  151.         :param type_: The type of this option, one of ('number', 'string',
  152.                         'bool', 'choices')
  153.         :param default: The default value for this option
  154.         :param label: A short (few words) description of this option
  155.         :param desc: A longer description of this option
  156.         :param choices: A dict of possible values, used only if type='choices'.
  157.         dict is of the form {key:human readable label, ...}
  158.         '''
  159.         self.name, self.type, self.default, self.label, self.desc = (name,
  160.                 type_, default, label, desc)
  161.         if choices and not isinstance(choices, dict):
  162.             choices = dict([(x, x) for x in choices])
  163.         self.choices = choices
  164. class Source(Plugin):
  165.     type = _('Metadata source')
  166.     author = 'Kovid Goyal'
  167.     supported_platforms = ['windows', 'osx', 'linux']
  168.     #: Set of capabilities supported by this plugin.
  169.     #: Useful capabilities are: 'identify', 'cover'
  170.     capabilities = frozenset()
  171.     #: List of metadata fields that can potentially be download by this plugin
  172.     #: during the identify phase
  173.     touched_fields = frozenset()
  174.     #: Set this to True if your plugin returns HTML formatted comments
  175.     has_html_comments = False
  176.     #: Setting this to True means that the browser object will add
  177.     #: Accept-Encoding: gzip to all requests. This can speedup downloads
  178.     #: but make sure that the source actually supports gzip transfer encoding
  179.     #: correctly first
  180.     supports_gzip_transfer_encoding = False
  181.     #: Cached cover URLs can sometimes be unreliable (i.e. the download could
  182.     #: fail or the returned image could be bogus. If that is often the case
  183.     #: with this source set to False
  184.     cached_cover_url_is_reliable = True
  185.     #: A list of :class:`Option` objects. They will be used to automatically
  186.     #: construct the configuration widget for this plugin
  187.     options = ()
  188.     #: A string that is displayed at the top of the config widget for this
  189.     #: plugin
  190.     config_help_message = None
  191.     #: If True this source can return multiple covers for a given query
  192.     can_get_multiple_covers = False
  193.     #: If set to True covers downloaded by this plugin are automatically trimmed.
  194.     auto_trim_covers = False
  195.     #: If set to True, and this source returns multiple results for a query,
  196.     #: some of which have ISBNs and some of which do not, the results without
  197.     #: ISBNs will be ignored
  198.     prefer_results_with_isbn = True
  199.     def __init__(self, *args, **kwargs):
  200.         Plugin.__init__(self, *args, **kwargs)
  201.         self.running_a_test = False  # Set to True when using identify_test()
  202.         self._isbn_to_identifier_cache = {}
  203.         self._identifier_to_cover_url_cache = {}
  204.         self.cache_lock = threading.RLock()
  205.         self._config_obj = None
  206.         self._browser = None
  207.         self.prefs.defaults['ignore_fields'] = []
  208.         for opt in self.options:
  209.             self.prefs.defaults[opt.name] = opt.default
  210.     # Configuration {{{
  211.     def is_configured(self):
  212.         '''
  213.         Return False if your plugin needs to be configured before it can be
  214.         used. For example, it might need a username/password/API key.
  215.         '''
  216.         return True
  217.     def is_customizable(self):
  218.         return True
  219.     def customization_help(self):
  220.         return 'This plugin can only be customized using the GUI'
  221.     def config_widget(self):
  222.         from calibre.gui2.metadata.config import ConfigWidget
  223.         return ConfigWidget(self)
  224.     def save_settings(self, config_widget):
  225.         config_widget.commit()
  226.     @property
  227.     def prefs(self):
  228.         if self._config_obj is None:
  229.             from calibre.utils.config import JSONConfig
  230.             self._config_obj = JSONConfig('metadata_sources/%s.json'%self.name)
  231.         return self._config_obj
  232.     # }}}
  233.     # Browser {{{
  234.     @property
  235.     def user_agent(self):
  236.         # Pass in an index to random_user_agent() to test with a particular
  237.         # user agent
  238.         return random_user_agent()
  239.     @property
  240.     def browser(self):
  241.         if self._browser is None:
  242.             self._browser = browser(user_agent=self.user_agent)
  243.             if self.supports_gzip_transfer_encoding:
  244.                 self._browser.set_handle_gzip(True)
  245.         return self._browser.clone_browser()
  246.     # }}}
  247.     # Caching {{{
  248.     def get_related_isbns(self, id_):
  249.         with self.cache_lock:
  250.             for isbn, q in self._isbn_to_identifier_cache.iteritems():
  251.                 if q == id_:
  252.                     yield isbn
  253.     def cache_isbn_to_identifier(self, isbn, identifier):
  254.         with self.cache_lock:
  255.             self._isbn_to_identifier_cache[isbn] = identifier
  256.     def cached_isbn_to_identifier(self, isbn):
  257.         with self.cache_lock:
  258.             return self._isbn_to_identifier_cache.get(isbn, None)
  259.     def cache_identifier_to_cover_url(self, id_, url):
  260.         with self.cache_lock:
  261.             self._identifier_to_cover_url_cache[id_] = url
  262.     def cached_identifier_to_cover_url(self, id_):
  263.         with self.cache_lock:
  264.             return self._identifier_to_cover_url_cache.get(id_, None)
  265.     def dump_caches(self):
  266.         with self.cache_lock:
  267.             return {'isbn_to_identifier':self._isbn_to_identifier_cache.copy(),
  268.                     'identifier_to_cover':self._identifier_to_cover_url_cache.copy()}
  269.     def load_caches(self, dump):
  270.         with self.cache_lock:
  271.             self._isbn_to_identifier_cache.update(dump['isbn_to_identifier'])
  272.             self._identifier_to_cover_url_cache.update(dump['identifier_to_cover'])
  273.     # }}}
  274.     # Utility functions {{{
  275.     def get_author_tokens(self, authors, only_first_author=True):
  276.         '''
  277.         Take a list of authors and return a list of tokens useful for an
  278.         AND search query. This function tries to return tokens in
  279.         first name middle names last name order, by assuming that if a comma is
  280.         in the author name, the name is in lastname, other names form.
  281.         '''
  282.         if authors:
  283.             # Leave ' in there for Irish names
  284.             remove_pat = re.compile(r'[!@#$%^&*(){}`~"\s\[\]/]')
  285.             replace_pat = re.compile(r'[-+.:;,]')
  286.             if only_first_author:
  287.                 authors = authors[:1]
  288.             for au in authors:
  289.                 has_comma = ',' in au
  290.                 au = replace_pat.sub(' ', au)
  291.                 parts = au.split()
  292.                 if has_comma:
  293.                     # au probably in ln, fn form
  294.                     parts = parts[1:] + parts[:1]
  295.                 for tok in parts:
  296.                     tok = remove_pat.sub('', tok).strip()
  297.                     if len(tok) > 2 and tok.lower() not in ('von', 'van',
  298.                             _('Unknown').lower()):
  299.                         yield tok
  300.     def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
  301.         '''
  302.         Take a title and return a list of tokens useful for an AND search query.
  303.         Excludes connectives(optionally) and punctuation.
  304.         '''
  305.         if title:
  306.             # strip sub-titles
  307.             if strip_subtitle:
  308.                 subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
  309.                 if len(subtitle.sub('', title)) > 1:
  310.                     title = subtitle.sub('', title)
  311.             title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
  312.             [
  313.                 # Remove things like: (2010) (Omnibus) etc.
  314.                 (r'(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]', ''),
  315.                 # Remove any strings that contain the substring edition inside
  316.                 # parentheses
  317.                 (r'(?i)[({\[].*?(edition|ed.).*?[\]})]', ''),
  318.                 # Remove commas used a separators in numbers
  319.                 (r'(\d+),(\d+)', r'\1\2'),
  320.                 # Remove hyphens only if they have whitespace before them
  321.                 (r'(\s-)', ' '),
  322.                 # Replace other special chars with a space
  323.                 (r'''[:,;!@$%^&*(){}.`~"\s\[\]/]''', ' '),
  324.             ]]
  325.             for pat, repl in title_patterns:
  326.                 title = pat.sub(repl, title)
  327.             tokens = title.split()
  328.             for token in tokens:
  329.                 token = token.strip().strip('"').strip("'")
  330.                 if token and (not strip_joiners or token.lower() not in ('a',
  331.                     'and', 'the', '&')):
  332.                     yield token
  333.     def split_jobs(self, jobs, num):
  334.         'Split a list of jobs into at most num groups, as evenly as possible'
  335.         groups = [[] for i in range(num)]
  336.         jobs = list(jobs)
  337.         while jobs:
  338.             for gr in groups:
  339.                 try:
  340.                     job = jobs.pop()
  341.                 except IndexError:
  342.                     break
  343.                 gr.append(job)
  344.         return [g for g in groups if g]
  345.     def test_fields(self, mi):
  346.         '''
  347.         Return the first field from self.touched_fields that is null on the
  348.         mi object
  349.         '''
  350.         for key in self.touched_fields:
  351.             if key.startswith('identifier:'):
  352.                 key = key.partition(':')[-1]
  353.                 if not mi.has_identifier(key):
  354.                     return 'identifier: ' + key
  355.             elif mi.is_null(key):
  356.                 return key
  357.     def clean_downloaded_metadata(self, mi):
  358.         '''
  359.         Call this method in your plugin's identify method to normalize metadata
  360.         before putting the Metadata object into result_queue. You can of
  361.         course, use a custom algorithm suited to your metadata source.
  362.         '''
  363.         docase = mi.language == 'eng' or mi.is_null('language')
  364.         if docase and mi.title:
  365.             mi.title = fixcase(mi.title)
  366.         mi.authors = fixauthors(mi.authors)
  367.         if mi.tags and docase:
  368.             mi.tags = list(map(fixcase, mi.tags))
  369.         mi.isbn = check_isbn(mi.isbn)
  370.     def download_multiple_covers(self, title, authors, urls, get_best_cover, timeout, result_queue, abort, log, prefs_name='max_covers'):
  371.         if not urls:
  372.             log('No images found for, title: %r and authors: %r'%(title, authors))
  373.             return
  374.         from threading import Thread
  375.         import time
  376.         if prefs_name:
  377.             urls = urls[:self.prefs[prefs_name]]
  378.         if get_best_cover:
  379.             urls = urls[:1]
  380.         log('Downloading %d covers'%len(urls))
  381.         workers = [Thread(target=self.download_image, args=(u, timeout, log, result_queue)) for u in urls]
  382.         for w in workers:
  383.             w.daemon = True
  384.             w.start()
  385.         alive = True
  386.         start_time = time.time()
  387.         while alive and not abort.is_set() and time.time() - start_time < timeout:
  388.             alive = False
  389.             for w in workers:
  390.                 if w.is_alive():
  391.                     alive = True
  392.                     break
  393.             abort.wait(0.1)
  394.     def download_image(self, url, timeout, log, result_queue):
  395.         try:
  396.             ans = self.browser.open_novisit(url, timeout=timeout).read()
  397.             result_queue.put((self, ans))
  398.             log('Downloaded cover from: %s'%url)
  399.         except Exception:
  400.             self.log.exception('Failed to download cover from: %r'%url)
  401.     # }}}
  402.     # Metadata API {{{
  403.     def get_book_url(self, identifiers):
  404.         '''
  405.         Return a 3-tuple or None. The 3-tuple is of the form:
  406.         (identifier_type, identifier_value, URL).
  407.         The URL is the URL for the book identified by identifiers at this
  408.         source. identifier_type, identifier_value specify the identifier
  409.         corresponding to the URL.
  410.         This URL must be browseable to by a human using a browser. It is meant
  411.         to provide a clickable link for the user to easily visit the books page
  412.         at this source.
  413.         If no URL is found, return None. This method must be quick, and
  414.         consistent, so only implement it if it is possible to construct the URL
  415.         from a known scheme given identifiers.
  416.         '''
  417.         return None
  418.     def get_book_url_name(self, idtype, idval, url):
  419.         '''
  420.         Return a human readable name from the return value of get_book_url().
  421.         '''
  422.         return self.name
  423.     def get_book_urls(self, identifiers):
  424.         '''
  425.         Override this method if you would like to return multiple urls for this book.
  426.         Return a list of 3-tuples. By default this method simply calls :func:`get_book_url`.
  427.         '''
  428.         data = self.get_book_url(identifiers)
  429.         if data is None:
  430.             return ()
  431.         return (data,)
  432.     def get_cached_cover_url(self, identifiers):
  433.         '''
  434.         Return cached cover URL for the book identified by
  435.         the identifiers dict or None if no such URL exists.
  436.         Note that this method must only return validated URLs, i.e. not URLS
  437.         that could result in a generic cover image or a not found error.
  438.         '''
  439.         return None
  440.     def identify_results_keygen(self, title=None, authors=None,
  441.             identifiers={}):
  442.         '''
  443.         Return a function that is used to generate a key that can sort Metadata
  444.         objects by their relevance given a search query (title, authors,
  445.         identifiers).
  446.         These keys are used to sort the results of a call to :meth:`identify`.
  447.         For details on the default algorithm see
  448.         :class:`InternalMetadataCompareKeyGen`. Re-implement this function in
  449.         your plugin if the default algorithm is not suitable.
  450.         '''
  451.         def keygen(mi):
  452.             return InternalMetadataCompareKeyGen(mi, self, title, authors,
  453.                 identifiers)
  454.         return keygen
  455.     def identify(self, log, result_queue, abort, title=None, authors=None,
  456.             identifiers={}, timeout=30):
  457.         '''
  458.         Identify a book by its title/author/isbn/etc.
  459.         If identifiers(s) are specified and no match is found and this metadata
  460.         source does not store all related identifiers (for example, all ISBNs
  461.         of a book), this method should retry with just the title and author
  462.         (assuming they were specified).
  463.         If this metadata source also provides covers, the URL to the cover
  464.         should be cached so that a subsequent call to the get covers API with
  465.         the same ISBN/special identifier does not need to get the cover URL
  466.         again. Use the caching API for this.
  467.         Every Metadata object put into result_queue by this method must have a
  468.         `source_relevance` attribute that is an integer indicating the order in
  469.         which the results were returned by the metadata source for this query.
  470.         This integer will be used by :meth:`compare_identify_results`. If the
  471.         order is unimportant, set it to zero for every result.
  472.         Make sure that any cover/isbn mapping information is cached before the
  473.         Metadata object is put into result_queue.
  474.         :param log: A log object, use it to output debugging information/errors
  475.         :param result_queue: A result Queue, results should be put into it.
  476.                             Each result is a Metadata object
  477.         :param abort: If abort.is_set() returns True, abort further processing
  478.                       and return as soon as possible
  479.         :param title: The title of the book, can be None
  480.         :param authors: A list of authors of the book, can be None
  481.         :param identifiers: A dictionary of other identifiers, most commonly
  482.                             {'isbn':'1234...'}
  483.         :param timeout: Timeout in seconds, no network request should hang for
  484.                         longer than timeout.
  485.         :return: None if no errors occurred, otherwise a unicode representation
  486.                  of the error suitable for showing to the user
  487.         '''
  488.         return None
  489.     def download_cover(self, log, result_queue, abort,
  490.             title=None, authors=None, identifiers={}, timeout=30, get_best_cover=False):
  491.         '''
  492.         Download a cover and put it into result_queue. The parameters all have
  493.         the same meaning as for :meth:`identify`. Put (self, cover_data) into
  494.         result_queue.
  495.         This method should use cached cover URLs for efficiency whenever
  496.         possible. When cached data is not present, most plugins simply call
  497.         identify and use its results.
  498.         If the parameter get_best_cover is True and this plugin can get
  499.         multiple covers, it should only get the "best" one.
  500.         '''
  501.         pass
  502.     # }}}
Lenguaje:
Para señalar algunas líneas en particular, al inicio de cada línea pon: @@





© 2017 - Powered by PASTE 1.0. Traducido al ESP y modificado por DesdeLinux.net