From acb03cff271cfd434469d382eadb1e71377c0a46 Mon Sep 17 00:00:00 2001
From: "A.M. Rowsell"
Date: Tue, 9 Jun 2026 18:10:22 -0400
Subject: [PATCH 1/2] dev: attempting to add image thumbnail support

---
Review notes (ignored by git am, not part of the commit message):
- The copy of this patch under review had its line breaks and indentation
  collapsed. The layout below is a reconstruction: hunk line counts match
  the hunk headers, but context-line indentation and blank-line placement
  are a best guess. Apply with `git am --ignore-whitespace`, or regenerate
  the series from the source branch.
- In that copy the img_src_filter pattern began at "]+src=", and the From:
  header had lost its address, so angle-bracketed text appears to have been
  stripped. The opening "<img[^>" is restored here; please confirm it
  against the author's tree.

 discorss.py | 97 +++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 76 insertions(+), 21 deletions(-)

diff --git a/discorss.py b/discorss.py
index bf431b9..49a7cfd 100755
--- a/discorss.py
+++ b/discorss.py
@@ -28,7 +28,8 @@ from types import SimpleNamespace
 class Discorss:
     FEED_TIMEOUT_SECONDS = 15
     HASH_HISTORY_LIMIT = 10
-    APP_VERSION = "0.3rc1"
+    APP_VERSION = "0.3rc2"
+    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif", ".webp")
 
     def __init__(self, args=None):
         if args is None:
@@ -54,6 +55,7 @@ class Discorss:
         # Yes, I know you "can't parse HTML with regex", but
         # just watch me.
         self.html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>")
+        self.img_src_filter = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.I)
 
         self.success_codes = [200, 201, 202, 203, 204, 205, 206]
         self.app_config = {}
@@ -142,29 +144,32 @@ class Discorss:
                 last_check,
                 self.now,
             )
-            webhook = {
-                "embeds": [
+            embed = {
+                "title": str(latest_post["title"]),
+                "url": str(latest_post["link"]),
+                "color": 2123412,
+                "footer": {
+                    "text": "DiscoRSS",
+                    "icon_url": "https://frzn.dev/~amr/images/discorss.png",
+                },
+                "author": {
+                    "name": str(hook["name"]),
+                    "url": str(hook["siteurl"]),
+                },
+                "fields": [
                     {
-                        "title": str(latest_post["title"]),
-                        "url": str(latest_post["link"]),
-                        "color": 2123412,
-                        "footer": {
-                            "text": "DiscoRSS",
-                            "icon_url": "https://frzn.dev/~amr/images/discorss.png",
-                        },
-                        "author": {
-                            "name": str(hook["name"]),
-                            "url": str(hook["siteurl"]),
-                        },
-                        "fields": [
-                            {
-                                "name": "Excerpt from post:",
-                                "value": self.get_description(latest_post),
-                            }
-                        ],
-                        # "timestamp": str(self.now),
+                        "name": "Excerpt from post:",
+                        "value": self.get_description(latest_post),
                     }
                 ],
+                # "timestamp": str(self.now),
+            }
+            image_url = self.get_image_url(latest_post)
+            if image_url is not None:
+                embed["thumbnail"] = {"url": image_url}
+
+            webhook = {
+                "embeds": [embed],
                 "attachments": [],
             }
             custom_header = {
@@ -264,6 +269,56 @@ class Discorss:
             desc = desc + str(addons)
         return desc
 
+    # attempting to extract image previews from feeds which primarily feature
+    # images, like NASA's Picture of the Day feed
+    def get_image_url(self, feed):
+        image_candidates = []
+        # check the most common fields, this should catch the majority of image
+        # feeds' embedded urls
+        for media in feed.get("media_content", []):
+            if self.is_image_url(media.get("url"), media.get("type")):
+                image_candidates.append(media["url"])
+
+        for enclosure in feed.get("enclosures", []):
+            if self.is_image_url(enclosure.get("href"), enclosure.get("type")):
+                image_candidates.append(enclosure["href"])
+
+        for link in feed.get("links", []):
+            if self.is_image_url(link.get("href"), link.get("type")):
+                image_candidates.append(link["href"])
+
+        for media in feed.get("media_thumbnail", []):
+            if self.is_image_url(media.get("url"), media.get("type")):
+                image_candidates.append(media["url"])
+
+        for field in ["summary_detail", "content"]:
+            value = feed.get(field)
+            if isinstance(value, list):
+                values = [item.get("value", "") for item in value]
+            elif isinstance(value, dict):
+                values = [value.get("value", "")]
+            else:
+                values = []
+            for text in values:
+                match = self.img_src_filter.search(str(text))
+                if match and self.is_image_url(match.group(1)):
+                    image_candidates.append(match.group(1))
+
+        if len(image_candidates) > 0:
+            return image_candidates[0]
+        return None
+
+    # a silly little helper just to validate image links
+    # this isn't 100% foolproof but it should work for the
+    # vast majority of feeds out there, unless they use some
+    # really weird image type like image/bpg
+    def is_image_url(self, url, mime_type=None):
+        if not url:
+            return False
+        if mime_type and str(mime_type).lower().startswith("image/"):
+            return True
+        return str(url).lower().split("?", 1)[0].endswith(self.IMAGE_EXTENSIONS)
+
     # Some of this could go in __init__
     def setup(self):
         os.environ["TZ"] = "America/Toronto"

From 91c39042c80021a3ce03e8f870dd52dc11d2e9b8 Mon Sep 17 00:00:00 2001
From: "A.M. Rowsell"
Date: Tue, 9 Jun 2026 18:36:31 -0400
Subject: [PATCH 2/2] logging: added some more logging statements

---
Review notes (ignored by git am, not part of the commit message):
- Layout reconstructed from a whitespace-collapsed copy; context-line
  indentation is a best guess (apply with `git am --ignore-whitespace`).
- get_image_url() is called with latest_post, and this series reads the
  feed's display name from hook["name"], not from the post. The submitted
  debug line indexed feed["name"]; logging arguments are evaluated even
  when DEBUG output is disabled, so a missing key would raise for every
  post. It now logs feed.get("title") instead, which cannot raise.
- NOTE(review): this commit adds debug statements while raising the
  basicConfig level from WARNING to ERROR. Please confirm the debug output
  can still be enabled some other way.

 discorss.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/discorss.py b/discorss.py
index 49a7cfd..8513a97 100755
--- a/discorss.py
+++ b/discorss.py
@@ -164,6 +164,11 @@ class Discorss:
                 ],
                 # "timestamp": str(self.now),
             }
+            self.logger.debug(
+                "Checking for images in post %s from %s...",
+                latest_post["title"],
+                hook["name"],
+            )
             image_url = self.get_image_url(latest_post)
             if image_url is not None:
                 embed["thumbnail"] = {"url": image_url}
@@ -303,7 +308,9 @@ class Discorss:
                 match = self.img_src_filter.search(str(text))
                 if match and self.is_image_url(match.group(1)):
                     image_candidates.append(match.group(1))
-
+        self.logger.debug("Found the following image candidates in %s...", feed.get("title"))
+        for i in image_candidates:
+            self.logger.debug("%s", i)
         if len(image_candidates) > 0:
             return image_candidates[0]
         return None
@@ -329,7 +336,7 @@ class Discorss:
         logging.basicConfig(
             filename=self.log_file_path,
             encoding="utf-8",
-            level=logging.WARNING,
+            level=logging.ERROR,
             datefmt="%m/%d/%Y %H:%M:%S",
             format="%(asctime)s [%(threadName)s] -> %(levelname)s: %(message)s",
         )