Compare commits

...
Sign in to create a new pull request.

2 commits

View file

@ -28,7 +28,8 @@ from types import SimpleNamespace
class Discorss:
FEED_TIMEOUT_SECONDS = 15
HASH_HISTORY_LIMIT = 10
APP_VERSION = "0.3rc1"
APP_VERSION = "0.3rc2"
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif", ".webp")
def __init__(self, args=None):
if args is None:
@ -54,6 +55,7 @@ class Discorss:
# Yes, I know you "can't parse HTML with regex", but
# just watch me.
self.html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>")
self.img_src_filter = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.I)
self.success_codes = [200, 201, 202, 203, 204, 205, 206]
self.app_config = {}
@ -142,29 +144,37 @@ class Discorss:
last_check,
self.now,
)
webhook = {
"embeds": [
embed = {
"title": str(latest_post["title"]),
"url": str(latest_post["link"]),
"color": 2123412,
"footer": {
"text": "DiscoRSS",
"icon_url": "https://frzn.dev/~amr/images/discorss.png",
},
"author": {
"name": str(hook["name"]),
"url": str(hook["siteurl"]),
},
"fields": [
{
"title": str(latest_post["title"]),
"url": str(latest_post["link"]),
"color": 2123412,
"footer": {
"text": "DiscoRSS",
"icon_url": "https://frzn.dev/~amr/images/discorss.png",
},
"author": {
"name": str(hook["name"]),
"url": str(hook["siteurl"]),
},
"fields": [
{
"name": "Excerpt from post:",
"value": self.get_description(latest_post),
}
],
# "timestamp": str(self.now),
"name": "Excerpt from post:",
"value": self.get_description(latest_post),
}
],
# "timestamp": str(self.now),
}
self.logger.debug(
"Checking for images in post %s from %s...",
latest_post["title"],
hook["name"],
)
image_url = self.get_image_url(latest_post)
if image_url is not None:
embed["thumbnail"] = {"url": image_url}
webhook = {
"embeds": [embed],
"attachments": [],
}
custom_header = {
@ -264,6 +274,58 @@ class Discorss:
desc = desc + str(addons)
return desc
# attempting to extract image previews from feeds which primarily feature
# images, like NASA's Picture of the Day feed
def get_image_url(self, feed):
    """Return the first image URL found in a feed entry, or None.

    Candidates are collected in priority order: ``media_content``,
    ``enclosures``, ``links``, ``media_thumbnail``, then ``<img src=...>``
    tags embedded in the ``summary_detail`` / ``content`` HTML.

    Args:
        feed: A single feed entry; any mapping that supports ``.get()``.

    Returns:
        The first candidate URL as found in the entry, or ``None`` when
        the entry exposes no recognisable image.
    """
    image_candidates = []

    # (entry field, key holding the URL) pairs, in priority order. These
    # structured fields should catch the majority of image feeds.
    url_sources = (
        ("media_content", "url"),
        ("enclosures", "href"),
        ("links", "href"),
        ("media_thumbnail", "url"),
    )
    for field, url_key in url_sources:
        for item in feed.get(field, []):
            if self.is_image_url(item.get(url_key), item.get("type")):
                image_candidates.append(item[url_key])

    # Fall back to scraping <img> tags out of the entry's HTML body.
    for field in ("summary_detail", "content"):
        value = feed.get(field)
        if isinstance(value, list):
            values = [item.get("value", "") for item in value]
        elif isinstance(value, dict):
            values = [value.get("value", "")]
        else:
            values = []
        for text in values:
            # Walk every <img>, not just the first: a leading tracking
            # pixel or extension-less src must not hide a real image.
            for match in self.img_src_filter.finditer(str(text)):
                if self.is_image_url(match.group(1)):
                    image_candidates.append(match.group(1))
                    break

    # The entry is identified by its title; "name" is a key of the hook
    # config, not of the entry, so indexing it here would raise KeyError
    # even when debug logging is disabled.
    self.logger.debug(
        "Found the following image candidates in %s...",
        feed.get("title", "<untitled>"),
    )
    for candidate in image_candidates:
        self.logger.debug("%s", candidate)

    return image_candidates[0] if image_candidates else None
# a silly little helper just to validate image links
# this isn't 100% foolproof but it should work for the
# vast majority of feeds out there, unless they use some
# really weird image type like image/bpg
def is_image_url(self, url, mime_type=None):
    """Return True when *url* looks like it points at an image.

    A declared ``image/*`` MIME type is trusted outright; otherwise the
    decision falls back to the file extension of the URL, compared
    against ``self.IMAGE_EXTENSIONS``.

    Args:
        url: Candidate URL; may be ``None`` or empty.
        mime_type: Optional MIME type advertised alongside the URL.

    Returns:
        ``True`` if the URL is considered an image, ``False`` otherwise.
    """
    if not url:
        return False
    if mime_type and str(mime_type).lower().startswith("image/"):
        return True
    # Drop the fragment first (it follows the query string), then the
    # query string, so "pic.png?w=100#top" is judged on "pic.png" alone.
    path = str(url).strip().lower().split("#", 1)[0].split("?", 1)[0]
    return path.endswith(self.IMAGE_EXTENSIONS)
# Some of this could go in __init__
def setup(self):
os.environ["TZ"] = "America/Toronto"
@ -274,7 +336,7 @@ class Discorss:
logging.basicConfig(
filename=self.log_file_path,
encoding="utf-8",
level=logging.WARNING,
level=logging.ERROR,
datefmt="%m/%d/%Y %H:%M:%S",
format="%(asctime)s [%(threadName)s] -> %(levelname)s: %(message)s",
)