hass-core/homeassistant/components/scrape/sensor.py
David Beitey 7a73c6adf7
scrape: extract strings from new non-text tags (#35021)
With the upgrade of beautifulsoup4 to 4.9.0 (#34007), certain tags
(`<style>`, `<script>` and `<template>`) are no longer treated as having
text content (see
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#comments-and-other-special-strings
and reported bug https://bugs.launchpad.net/beautifulsoup/+bug/1868861)
meaning the content of these types of tags became inaccessible to HA.

Where the previous code could access `.text` on the tag, bs4 4.9 now
yields an empty string; these types of tags require accessing `.string`
instead.  This PR checks the tag name (which will always be lowercase
given how the parser works;
https://www.crummy.com/software/BeautifulSoup/bs4/doc/#other-parser-problems)
and applies this different access strategy to get the content of the
HTML tag.  All other tags are handled in the original manner.
2020-05-04 10:45:40 +02:00

150 lines
4.7 KiB
Python

"""Support for getting data from websites with scraping."""
import logging
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth, HTTPDigestAuth
import voluptuous as vol
from homeassistant.components.rest.sensor import RestData
from homeassistant.components.sensor import PLATFORM_SCHEMA
from homeassistant.const import (
CONF_AUTHENTICATION,
CONF_HEADERS,
CONF_NAME,
CONF_PASSWORD,
CONF_RESOURCE,
CONF_UNIT_OF_MEASUREMENT,
CONF_USERNAME,
CONF_VALUE_TEMPLATE,
CONF_VERIFY_SSL,
HTTP_BASIC_AUTHENTICATION,
HTTP_DIGEST_AUTHENTICATION,
)
from homeassistant.exceptions import PlatformNotReady
import homeassistant.helpers.config_validation as cv
from homeassistant.helpers.entity import Entity
# Module-level logger named after this module.
_LOGGER = logging.getLogger(__name__)
# Configuration keys specific to this platform:
#   CONF_SELECT - expression handed to BeautifulSoup's ``select()``.
#   CONF_ATTR   - optional attribute read from the selected tag instead of
#                 its text content.
#   CONF_INDEX  - position of the wanted tag within the list of matches.
CONF_ATTR = "attribute"
CONF_SELECT = "select"
CONF_INDEX = "index"
# Defaults used by the schema below when the option is omitted.
DEFAULT_NAME = "Web scrape"
DEFAULT_VERIFY_SSL = True
# Extend the base sensor platform schema with the scrape options.
# Only the resource and the select expression are required; every other
# option is optional, and index, name and verify_ssl carry defaults.
PLATFORM_SCHEMA = PLATFORM_SCHEMA.extend(
{
vol.Required(CONF_RESOURCE): cv.string,
vol.Required(CONF_SELECT): cv.string,
vol.Optional(CONF_ATTR): cv.string,
# Defaults to the first match.
vol.Optional(CONF_INDEX, default=0): cv.positive_int,
# Only basic and digest HTTP authentication are accepted.
vol.Optional(CONF_AUTHENTICATION): vol.In(
[HTTP_BASIC_AUTHENTICATION, HTTP_DIGEST_AUTHENTICATION]
),
# Mapping of header name to header value, both strings.
vol.Optional(CONF_HEADERS): vol.Schema({cv.string: cv.string}),
vol.Optional(CONF_NAME, default=DEFAULT_NAME): cv.string,
vol.Optional(CONF_PASSWORD): cv.string,
vol.Optional(CONF_UNIT_OF_MEASUREMENT): cv.string,
vol.Optional(CONF_USERNAME): cv.string,
vol.Optional(CONF_VALUE_TEMPLATE): cv.template,
vol.Optional(CONF_VERIFY_SSL, default=DEFAULT_VERIFY_SSL): cv.boolean,
}
)
def setup_platform(hass, config, add_entities, discovery_info=None):
    """Create the scrape sensor described by the platform configuration.

    The resource is fetched once up front with a GET request. If that first
    fetch yields no data, PlatformNotReady is raised; otherwise a single
    ScrapeSensor is handed to ``add_entities``.
    """
    sensor_name = config.get(CONF_NAME)
    url = config.get(CONF_RESOURCE)
    request_headers = config.get(CONF_HEADERS)
    check_ssl = config.get(CONF_VERIFY_SSL)
    selector = config.get(CONF_SELECT)
    attribute = config.get(CONF_ATTR)
    position = config.get(CONF_INDEX)
    unit_of_measurement = config.get(CONF_UNIT_OF_MEASUREMENT)
    user = config.get(CONF_USERNAME)
    secret = config.get(CONF_PASSWORD)

    # Give the template access to hass before it is ever rendered.
    template = config.get(CONF_VALUE_TEMPLATE)
    if template is not None:
        template.hass = hass

    # Credentials are only built when both parts are present; basic auth is
    # the fallback whenever digest was not explicitly requested.
    credentials = None
    if user and secret:
        wants_digest = config.get(CONF_AUTHENTICATION) == HTTP_DIGEST_AUTHENTICATION
        auth_factory = HTTPDigestAuth if wants_digest else HTTPBasicAuth
        credentials = auth_factory(user, secret)

    fetcher = RestData("GET", url, credentials, request_headers, None, check_ssl)
    fetcher.update()
    if fetcher.data is None:
        raise PlatformNotReady

    sensor = ScrapeSensor(
        fetcher, sensor_name, selector, attribute, position, template, unit_of_measurement
    )
    add_entities([sensor], True)
class ScrapeSensor(Entity):
    """Representation of a web scrape sensor."""

    def __init__(self, rest, name, select, attr, index, value_template, unit):
        """Initialize a web scrape sensor.

        rest: data source exposing ``update()`` and a ``data`` attribute
            holding the fetched document (``None`` when the fetch failed).
        name: name reported for the sensor.
        select: expression passed to BeautifulSoup's ``select()``.
        attr: attribute to read from the selected tag, or ``None`` to use
            the tag's text content.
        index: position of the wanted tag within the list of matches.
        value_template: optional template applied to the extracted value.
        unit: unit of measurement reported for the sensor.
        """
        self.rest = rest
        self._name = name
        self._state = None
        self._select = select
        self._attr = attr
        self._index = index
        self._value_template = value_template
        self._unit_of_measurement = unit

    @property
    def name(self):
        """Return the name of the sensor."""
        return self._name

    @property
    def unit_of_measurement(self):
        """Return the unit the value is expressed in."""
        return self._unit_of_measurement

    @property
    def state(self):
        """Return the state of the device."""
        return self._state

    def update(self):
        """Get the latest data from the source and update the state.

        When the fetch fails, the selector has no match at the configured
        index, or the configured attribute is missing from the matched tag,
        an error is logged and the previous state is left untouched.
        """
        self.rest.update()
        if self.rest.data is None:
            _LOGGER.error("Unable to retrieve data")
            return

        raw_data = BeautifulSoup(self.rest.data, "html.parser")
        _LOGGER.debug(raw_data)

        try:
            tag = raw_data.select(self._select)[self._index]
            if self._attr is not None:
                value = tag[self._attr]
            elif tag.name in ("style", "script", "template"):
                # Since beautifulsoup4 4.9 these tags are no longer treated
                # as having text content, so ``.text`` is empty for them;
                # their content is only reachable through ``.string``.
                value = tag.string
            else:
                value = tag.text
            _LOGGER.debug(value)
        except IndexError:
            _LOGGER.error("Unable to extract data from HTML")
            return
        except KeyError:
            # Subscripting a tag with an attribute it does not carry raises
            # KeyError; report it instead of letting it escape update().
            _LOGGER.error(
                "Unable to extract attribute '%s' from HTML", self._attr
            )
            return

        if self._value_template is not None:
            self._state = self._value_template.render_with_possible_json_value(
                value, None
            )
        else:
            self._state = value