server-tools/html_text/models/ir_fields_converter.py

75 lines
2.3 KiB
Python

# Copyright 2016-2017 Jairo Llopis <jairo.llopis@tecnativa.com>
# Copyright 2016 Tecnativa - Vicent Cubells
# License AGPL-3.0 or later (https://www.gnu.org/licenses/agpl).
import logging
from lxml import etree, html
from odoo import api, models
_logger = logging.getLogger(__name__)
class IrFieldsConverter(models.AbstractModel):
_inherit = "ir.fields.converter"
@api.model
def text_from_html(
self, html_content, max_words=None, max_chars=None, ellipsis="", fail=False
):
"""Extract text from an HTML field in a generator.
:param str html_content:
HTML contents from where to extract the text.
:param int max_words:
Maximum amount of words allowed in the resulting string.
:param int max_chars:
Maximum amount of characters allowed in the resulting string. If
you apply this limit, beware that the last word could get cut in an
unexpected place.
:param str ellipsis:
Character(s) to be appended to the end of the resulting string if
it gets truncated after applying limits set in :param:`max_words`
or :param:`max_chars`. If you want nothing applied, just set an
empty string.
:param bool fail:
If ``True``, exceptions will be raised. Otherwise, an empty string
will be returned on failure.
"""
# Parse HTML
try:
doc = html.fromstring(html_content)
except (TypeError, etree.XMLSyntaxError, etree.ParserError):
if fail:
raise
else:
_logger.exception("Failure parsing this HTML:\n%s", html_content)
return ""
# Get words
words = "".join(doc.xpath("//text()")).split()
# Truncate words
suffix = max_words and len(words) > max_words
if max_words:
words = words[:max_words]
# Get text
text = " ".join(words)
# Truncate text
suffix = suffix or max_chars and len(text) > max_chars
if max_chars:
text = text[: max_chars - (len(ellipsis) if suffix else 0)].strip()
# Append ellipsis if needed
if suffix:
text += ellipsis
return text