From c5cb655943cd9cf1e8d6b91a08ffda95076aa165 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 31 Jan 2025 16:27:44 -0500 Subject: [PATCH 01/44] Increased description length again to 150. --- discorss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discorss.py b/discorss.py index 07b80a2..a89a06c 100755 --- a/discorss.py +++ b/discorss.py @@ -24,10 +24,10 @@ log_file_name = r"/app.log" def getDescription(feed): try: tempStr = str(feed.entries[0]["summary_detail"]["value"]) - desc = tempStr[:100] if len(tempStr) > 100 else tempStr + desc = tempStr[:150] if len(tempStr) > 150 else tempStr except KeyError: tempStr = str(feed.entries[0]["description"]) - desc = tempStr[:100] if len(tempStr) > 100 else tempStr + desc = tempStr[:150] if len(tempStr) > 150 else tempStr return desc From 22e4294469658de9fa3592c7fe8955f77ea00dc1 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 31 Jan 2025 16:28:04 -0500 Subject: [PATCH 02/44] Added back correct provider, as well as author fields --- discorss.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index a89a06c..1b3df2c 100755 --- a/discorss.py +++ b/discorss.py @@ -54,12 +54,16 @@ def main(): print(feed.entries[0]["published"], published_time, now) # Generate the webhook webhook = { - "content": "RSS Feed Update from " + str(hook["name"]), "embeds": [ { "title": str(feed.entries[0]["title"]), "url": str(feed.entries[0]["link"]), "color": 5814783, + "provider": { + "name": "DiscoRSS", + "url": "https://git.frzn.dev/amr/discorss", + }, + "author": {"name": str(hook["name"]), "url": str(hook["siteurl"])}, "fields": [ { "name": str(feed.entries[0]["title"]), From 40156c434f1299d730336041fe1ff1c785d3200c Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 31 Jan 2025 16:37:24 -0500 Subject: [PATCH 03/44] Updated README with example systemd units and updated config file. --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 17134ec..546db71 100644 --- a/README.md +++ b/README.md @@ -9,24 +9,26 @@ requests >= 2.4.2 feedparser ``` -The remaining imports should all be part of the standard Python install. To configure the script, create /etc/discorss.conf with the following structure: +The remaining imports should all be part of the standard Python install. + +## How to setup + +To configure the script, create /etc/discorss.conf with the following structure: ``` { "feeds": [ { - "name": "phoronix", + "name": "Phoronix", + "siteurl": "https://www.phoronix.com/", "url": "http://www.phoronix.com/rss.php", - "webhook": "webhook url" + "webhook": "webhook url", + "offset": -18000 }, { - "name": "pagetable", + "name": "Pagetable", + "siteurl": "https://pagetable.com", "url": "https://www.pagetable.com/?feed=rss2", - "webhook": "webhook url" - }, - { - "name": "righto", - "url": "https://www.righto.com/feeds/posts/default", "webhook": "webhook url", "offset": -18000 } @@ -34,6 +36,42 @@ The remaining imports should all be part of the standard Python install. To conf } ``` -The offset should only be required if feeds aren't showing up. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds. So most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update, I just need to sit down and wrangle with feedparser and datetime some more. +Create a webhook for each feed (unless you want them all to show as the same webhook for whatever reason) and make sure to add it in to the config. I have it set up with a webhook for each site, each with the site's icon and name set for the webhook which makes the messages look really nice. -To automate feed posting, create a systemd service and timer to execute the script. I will include examples soon. +The offset should only be required if feeds aren't showing up. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds. So most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update, I just need to sit down and wrangle with feedparser and datetime some more. All fields are mandatory, if you want to have no offset for example, set it to 0. The name and siteurl are used to create the "author" field in the Discord embed. + +## Automation + +To automate feed posting, create a systemd service and timer to execute the script. + +Use the command `systemctl --user edit --full --force discorss.service` and then paste in something like this: + +``` +[Unit] +Description=Discord RSS feeder +Wants=discorss.timer + +[Service] +Type=oneshot +ExecStart=/path/to/discorss.py + +[Install] +WantedBy=default.target +``` + +Make sure to edit the ExecStart to point to the correct location. Then we need a systemd timer to automatically fire the script. Run `systemctl --user edit --full --force discorss.timer` and then paste in this: +``` +[Unit] +Description=Timer for DiscoRSS +Requires=discorss.service + +[Timer] +Unit=discorss.service +OnCalendar=*-*-* *:00,15,30,45:30 +AccuracySec=10s + +[Install] +WantedBy=timers.target +``` + +To change how often this fires, edit the OnCalendar parameter. The config above has it firing every 15 minutes at half past the minute. Look at the systemd timer man pages for help if you want to tweak it. From da7ec6dddaa61512b42793bb2d05152842656f98 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 31 Jan 2025 23:35:02 -0500 Subject: [PATCH 04/44] Made variable names consistent. Added MPL-2.0 text header. --- discorss.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/discorss.py b/discorss.py index 1b3df2c..288aa4e 100755 --- a/discorss.py +++ b/discorss.py @@ -3,6 +3,10 @@ # SPDX-License-Identifier: MPL-2.0 # SPDX-FileCopyrightText: © 2025 A.M. Rowsell +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + # DiscoRSS: A simple RSS feed reader for Discord. Takes RSS feeds and then sends them to # webhooks. Intended to run using systemd timers. @@ -21,13 +25,17 @@ log_file_path = r"./log" log_file_name = r"/app.log" -def getDescription(feed): +def get_description(feed): try: - tempStr = str(feed.entries[0]["summary_detail"]["value"]) - desc = tempStr[:150] if len(tempStr) > 150 else tempStr + temporary_string = str(feed.entries[0]["summary_detail"]["value"]) + desc = ( + temporary_string[:150] if len(temporary_string) > 150 else temporary_string + ) except KeyError: - tempStr = str(feed.entries[0]["description"]) - desc = tempStr[:150] if len(tempStr) > 150 else tempStr + temporary_string = str(feed.entries[0]["description"]) + desc = ( + temporary_string[:150] if len(temporary_string) > 150 else temporary_string + ) return desc @@ -67,21 +75,23 @@ def main(): "fields": [ { "name": str(feed.entries[0]["title"]), - "value": getDescription(feed), + "value": get_description(feed), } ], } ], "attachments": [], } - customHeader = { - "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.1)", + custom_header = { + "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc1)", "content-type": "application/json", } - webhookStr = json.dumps(webhook) - print(webhookStr) + webhook_string = json.dumps(webhook) + # print(webhook_string) if published_time > last_check and published_time < now: - r = requests.post(hook["webhook"], data=webhookStr, headers=customHeader) + r = requests.post( + hook["webhook"], data=webhook_string, headers=custom_header + ) app_config["lastupdate"] = now with open(config_file_path, "w") as config_file: json.dump(app_config, config_file, indent=4) From 243976511751ccc24a47ad6e464a28e3542e39a4 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 31 Jan 2025 23:35:31 -0500 Subject: [PATCH 05/44] Updated README with contribution guidelines. --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 546db71..f8f7763 100644 --- a/README.md +++ b/README.md @@ -75,3 +75,9 @@ WantedBy=timers.target ``` To change how often this fires, edit the OnCalendar parameter. The config above has it firing every 15 minutes at half past the minute. Look at the systemd timer man pages for help if you want to tweak it. + +## Contributing + +Want to fix something or make a suggestion? Feel free! If you want to send a pull request, you *must* run the Python `black` formatter on the source code before committing. I have this set up in my editor to automatically run every time I save the file, but you could have it run as part of a git hook or something. For non-format stuff, please just follow the code style as best you can. For Python code, I separate multi-word variable names with underscores. So it should be `feed_time`, not `feedTime` or `FeedTime` or `feed-time`. Don't ask me why, but I use camelCase for other languages... but in Python I've switched to underscores. + +If you know how and are able to, *please* sign your commits with the `-S` option to `git commit`. This shows that you are the author, especially if others have signed your keys. From 422a71752672dd841c1a7618d9c0b0a5d69fbdb4 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 2 Feb 2025 11:21:52 -0500 Subject: [PATCH 06/44] Changed embed colour, footer, removed datetime, added error detection --- discorss.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/discorss.py b/discorss.py index 288aa4e..c720c68 100755 --- a/discorss.py +++ b/discorss.py @@ -14,7 +14,7 @@ import requests import feedparser from pathlib import Path import json -import datetime +# import datetime import time import os @@ -66,10 +66,10 @@ def main(): { "title": str(feed.entries[0]["title"]), "url": str(feed.entries[0]["link"]), - "color": 5814783, - "provider": { + "color": 216128, + "footer": { "name": "DiscoRSS", - "url": "https://git.frzn.dev/amr/discorss", + # "url": "https://git.frzn.dev/amr/discorss", }, "author": {"name": str(hook["name"]), "url": str(hook["siteurl"])}, "fields": [ @@ -92,6 +92,8 @@ def main(): r = requests.post( hook["webhook"], data=webhook_string, headers=custom_header ) + if r.status_code != '200': + print("Error {} while trying to post {}".format(r.status_code, hook["webhook"])) app_config["lastupdate"] = now with open(config_file_path, "w") as config_file: json.dump(app_config, config_file, indent=4) From 5faf789e82c33254d9fa40e0d798798173090efc Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 2 Feb 2025 11:23:23 -0500 Subject: [PATCH 07/44] added .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f8ec04c --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.conf +*.txt +log/ \ No newline at end of file From 30756fc46267faad0af1fa7537d1e6045a34b982 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Mon, 3 Feb 2025 14:34:15 -0500 Subject: [PATCH 08/44] Added HTML filter, removed duplicate titles in embed, better output --- discorss.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/discorss.py b/discorss.py index c720c68..46224b3 100755 --- a/discorss.py +++ b/discorss.py @@ -14,25 +14,30 @@ import requests import feedparser from pathlib import Path import json -# import datetime import time import os +import re config_file_path = r"/etc/discorss.conf" # config_file_path = r"discorss.conf" -# log_file_path = r"/var/log/discorss" -log_file_path = r"./log" +log_file_path = r"/var/log/discorss" +# log_file_path = r"./log" log_file_name = r"/app.log" +# Yes, I know you "can't parse HTML with regex", but +# just watch me. +html_filter = re.compile(r"\<\/?([A-Za-z \"\=])*\>") def get_description(feed): try: temporary_string = str(feed.entries[0]["summary_detail"]["value"]) + temporary_string = html_filter.sub("", temporary_string) desc = ( temporary_string[:150] if len(temporary_string) > 150 else temporary_string ) except KeyError: temporary_string = str(feed.entries[0]["description"]) + temporary_string = html_filter.sub("", temporary_string) desc = ( temporary_string[:150] if len(temporary_string) > 150 else temporary_string ) @@ -59,7 +64,7 @@ def main(): feed = feedparser.parse(hook["url"]) published_time = time.mktime(feed.entries[0]["published_parsed"]) published_time = published_time + hook["offset"] - print(feed.entries[0]["published"], published_time, now) + print("Parsing feed {}...".format(hook["name"])) # Generate the webhook webhook = { "embeds": [ @@ -74,7 +79,7 @@ def main(): "author": {"name": str(hook["name"]), "url": str(hook["siteurl"])}, "fields": [ { - "name": str(feed.entries[0]["title"]), + "name": "Excerpt from post:", "value": get_description(feed), } ], @@ -92,8 +97,12 @@ def main(): r = requests.post( hook["webhook"], data=webhook_string, headers=custom_header ) - if r.status_code != '200': - print("Error {} while trying to post {}".format(r.status_code, hook["webhook"])) + if r.status_code != "200": + print( + "Error {} while trying to post {}".format( + r.status_code, hook["webhook"] + ) + ) app_config["lastupdate"] = now with open(config_file_path, "w") as config_file: json.dump(app_config, config_file, indent=4) From 8683d08d1c6006be6503205991c73fb27e4dd179 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 5 Feb 2025 15:29:04 -0500 Subject: [PATCH 09/44] First attempt at hash-based feed tracking --- discorss.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index 46224b3..bd1f182 100755 --- a/discorss.py +++ b/discorss.py @@ -12,6 +12,7 @@ import requests import feedparser +import hashlib from pathlib import Path import json import time @@ -59,12 +60,20 @@ def main(): app_config = json.load(config_file) now = time.mktime(time.localtime()) last_check = app_config["lastupdate"] - for hook in app_config["feeds"]: + for i, hook in enumerate(app_config["feeds"]): # Get the feed feed = feedparser.parse(hook["url"]) published_time = time.mktime(feed.entries[0]["published_parsed"]) published_time = published_time + hook["offset"] print("Parsing feed {}...".format(hook["name"])) + new_hash = hashlib.sha3_512(feed.entries[0]["title"]) + try: + if hook["lasthash"] != new_hash: + app_config["feeds"][i]["lasthash"] = new_hash + else: + continue + except KeyError: + app_config["feeds"][i]["lasthash"] = new_hash # Generate the webhook webhook = { "embeds": [ From ec88faa437d6e995927c50372dfde540bd301653 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 5 Feb 2025 15:30:34 -0500 Subject: [PATCH 10/44] Changed HTTP status code checking to catch success codes other than 200 --- discorss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index 46224b3..7e49771 100755 --- a/discorss.py +++ b/discorss.py @@ -26,6 +26,7 @@ log_file_name = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. html_filter = re.compile(r"\<\/?([A-Za-z \"\=])*\>") +success_codes = ['200', '201', '202', '203', '204', '205', '206'] def get_description(feed): @@ -97,7 +98,7 @@ def main(): r = requests.post( hook["webhook"], data=webhook_string, headers=custom_header ) - if r.status_code != "200": + if r.status_code not in success_codes: print( "Error {} while trying to post {}".format( r.status_code, hook["webhook"] From bd693f6f42a11b38c23d82731486b5fb53107c2e Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 5 Feb 2025 23:27:49 -0500 Subject: [PATCH 11/44] Added check for non-existant lastupdate key --- .gitignore | 3 ++- discorss.py | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f8ec04c..d48e085 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ *.conf *.txt -log/ \ No newline at end of file +log/ +*.bak \ No newline at end of file diff --git a/discorss.py b/discorss.py index bd1f182..b869e7c 100755 --- a/discorss.py +++ b/discorss.py @@ -59,7 +59,10 @@ def main(): with open(config_file_path, "r") as config_file: app_config = json.load(config_file) now = time.mktime(time.localtime()) - last_check = app_config["lastupdate"] + try: + last_check = app_config["lastupdate"] + except KeyError: + last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(app_config["feeds"]): # Get the feed feed = feedparser.parse(hook["url"]) From 87193d0f9402a3578e4b6e18952338d45dcc48da Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 5 Feb 2025 23:28:14 -0500 Subject: [PATCH 12/44] Added sha3_512 hash of post title, to migrate from using time --- discorss.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index b869e7c..7eb752e 100755 --- a/discorss.py +++ b/discorss.py @@ -69,7 +69,8 @@ def main(): published_time = time.mktime(feed.entries[0]["published_parsed"]) published_time = published_time + hook["offset"] print("Parsing feed {}...".format(hook["name"])) - new_hash = hashlib.sha3_512(feed.entries[0]["title"]) + # Hash the title of the latest post and use that to determine if it's been posted + new_hash = hashlib.sha3_512(bytes(feed.entries[0]["title"], 'utf-8')).hexdigest() try: if hook["lasthash"] != new_hash: app_config["feeds"][i]["lasthash"] = new_hash From a188f8ee5d129842842f52297548753b36fcd536 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 5 Feb 2025 23:40:27 -0500 Subject: [PATCH 13/44] Removed part of time check --- discorss.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/discorss.py b/discorss.py index 7eb752e..6cff9c7 100755 --- a/discorss.py +++ b/discorss.py @@ -105,8 +105,8 @@ def main(): "content-type": "application/json", } webhook_string = json.dumps(webhook) - # print(webhook_string) - if published_time > last_check and published_time < now: + + if published_time > last_check: r = requests.post( hook["webhook"], data=webhook_string, headers=custom_header ) From 630560405624b9716c04c0484a8f64379ceccd9f Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Thu, 6 Feb 2025 04:53:55 -0500 Subject: [PATCH 14/44] Updated gitignore to ignore pyvenv files --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index d48e085..fed12ea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ *.conf *.txt log/ -*.bak \ No newline at end of file +*.bak +bin/ +lib/ +*.cfg \ No newline at end of file From d15578f7b2aef7220337fb06f0f827c131d6c129 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 7 Feb 2025 20:24:14 -0500 Subject: [PATCH 15/44] Changed formatting, added some comments, moved config file location --- discorss.py | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/discorss.py b/discorss.py index 3cd9ce6..be76e69 100755 --- a/discorss.py +++ b/discorss.py @@ -19,29 +19,41 @@ import time import os import re -config_file_path = r"/etc/discorss.conf" -# config_file_path = r"discorss.conf" +config_dir = os.environ.get('XDG_CONFIG_HOME') +if config_dir is None: + config_file_path = r"~/.config/discorss/discorss.conf" + config_dir = r"~/.config/discorss" +else: + config_file_path = config_dir + r"/discorss/discorss.conf" log_file_path = r"/var/log/discorss" # log_file_path = r"./log" log_file_name = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. html_filter = re.compile(r"\<\/?([A-Za-z \"\=])*\>") -success_codes = ['200', '201', '202', '203', '204', '205', '206'] +success_codes = ["200", "201", "202", "203", "204", "205", "206"] +# This function gets and formats the brief excerpt that goes in the embed +# Different feeds put summaries in different fields, so we pick the best +# one and limit it to 150 characters. +# TODO: make the character limit smarter, as to split at a natural point def get_description(feed): try: temporary_string = str(feed.entries[0]["summary_detail"]["value"]) temporary_string = html_filter.sub("", temporary_string) desc = ( - temporary_string[:150] if len(temporary_string) > 150 else temporary_string + temporary_string[:150] + if len(temporary_string) > 150 + else temporary_string ) except KeyError: temporary_string = str(feed.entries[0]["description"]) temporary_string = html_filter.sub("", temporary_string) desc = ( - temporary_string[:150] if len(temporary_string) > 150 else temporary_string + temporary_string[:150] + if len(temporary_string) > 150 + else temporary_string ) return desc @@ -49,13 +61,14 @@ def get_description(feed): def main(): os.environ["TZ"] = "America/Toronto" time.tzset() + # Check for log and config files/paths, create empty directories if needed try: Path(log_file_path).mkdir(parents=True, exist_ok=True) except FileExistsError: - print("This path already exists and is not a directory!") - # Load and read the config file + print("The logfile path {} already exists and is not a directory!".format(log_file_path)) if not Path(config_file_path).exists(): - print("No config file! Snarf. Directories were created for you.") + print("No config file at {}! Snarf.\n{} was created for you.".format(config_file_path, config_dir)) + Path(config_file_path).mkdir(parents=True, exist_ok=True) return with open(config_file_path, "r") as config_file: app_config = json.load(config_file) @@ -63,7 +76,9 @@ def main(): try: last_check = app_config["lastupdate"] except KeyError: - last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago + last_check = ( + now - 21600 + ) # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(app_config["feeds"]): # Get the feed feed = feedparser.parse(hook["url"]) @@ -71,7 +86,9 @@ def main(): published_time = published_time + hook["offset"] print("Parsing feed {}...".format(hook["name"])) # Hash the title of the latest post and use that to determine if it's been posted - new_hash = hashlib.sha3_512(bytes(feed.entries[0]["title"], 'utf-8')).hexdigest() + new_hash = hashlib.sha3_512( + bytes(feed.entries[0]["title"], "utf-8") + ).hexdigest() try: if hook["lasthash"] != new_hash: app_config["feeds"][i]["lasthash"] = new_hash @@ -90,7 +107,10 @@ def main(): "name": "DiscoRSS", # "url": "https://git.frzn.dev/amr/discorss", }, - "author": {"name": str(hook["name"]), "url": str(hook["siteurl"])}, + "author": { + "name": str(hook["name"]), + "url": str(hook["siteurl"]), + }, "fields": [ { "name": "Excerpt from post:", From 6709c14b1d865dc9064d77972893b2ab4e4fdc15 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 9 Feb 2025 12:40:29 -0500 Subject: [PATCH 16/44] Added logo to README --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f8f7763..03c17cd 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,7 @@ # DiscoRSS +![DiscoRSS Logo](https://frzn.dev/~amr/images/discorss.png) + ## What is it? DiscoRSS is a simple Python script to send RSS feeds to Discord webhooks. It was created because existing bots that did this set limits on the number of feeds, and self-hosting stuff is easier and better anyway. To get this working, you will require the following Python libraries: From bb3edcbfc71d0109f2e217bb3ef63d6b4bd5f28b Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Thu, 13 Feb 2025 21:16:55 -0500 Subject: [PATCH 17/44] Various small fixes, see full commit message * Changed the preview length to 250 characters * Changed the version in the user-agent string * Cleaned up paths, switching to user paths * Updated html regex filter to correctly remove : . / * Various formatting changes from black * Wrapped published_time in try/except --- discorss.py | 49 ++++++++++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/discorss.py b/discorss.py index be76e69..4fe079b 100755 --- a/discorss.py +++ b/discorss.py @@ -19,10 +19,11 @@ import time import os import re -config_dir = os.environ.get('XDG_CONFIG_HOME') +config_dir = os.environ.get("XDG_CONFIG_HOME") +home_dir = Path.home() if config_dir is None: - config_file_path = r"~/.config/discorss/discorss.conf" - config_dir = r"~/.config/discorss" + config_file_path = str(home_dir) + "/.config/discorss/discorss.conf" + config_dir = str(home_dir) + "/.config/discorss" else: config_file_path = config_dir + r"/discorss/discorss.conf" log_file_path = r"/var/log/discorss" @@ -30,30 +31,26 @@ log_file_path = r"/var/log/discorss" log_file_name = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. -html_filter = re.compile(r"\<\/?([A-Za-z \"\=])*\>") +html_filter = re.compile(r"\<\/?([A-Za-z \:\.\/\"\=])*\>") success_codes = ["200", "201", "202", "203", "204", "205", "206"] # This function gets and formats the brief excerpt that goes in the embed # Different feeds put summaries in different fields, so we pick the best -# one and limit it to 150 characters. +# one and limit it to 250 characters. # TODO: make the character limit smarter, as to split at a natural point def get_description(feed): try: temporary_string = str(feed.entries[0]["summary_detail"]["value"]) temporary_string = html_filter.sub("", temporary_string) desc = ( - temporary_string[:150] - if len(temporary_string) > 150 - else temporary_string + temporary_string[:250] if len(temporary_string) > 250 else temporary_string ) except KeyError: temporary_string = str(feed.entries[0]["description"]) temporary_string = html_filter.sub("", temporary_string) desc = ( - temporary_string[:150] - if len(temporary_string) > 150 - else temporary_string + temporary_string[:250] if len(temporary_string) > 250 else temporary_string ) return desc @@ -62,13 +59,22 @@ def main(): os.environ["TZ"] = "America/Toronto" time.tzset() # Check for log and config files/paths, create empty directories if needed + # TODO: make this cleaner try: Path(log_file_path).mkdir(parents=True, exist_ok=True) except FileExistsError: - print("The logfile path {} already exists and is not a directory!".format(log_file_path)) + print( + "The logfile path {} already exists and is not a directory!".format( + log_file_path + ) + ) if not Path(config_file_path).exists(): - print("No config file at {}! Snarf.\n{} was created for you.".format(config_file_path, config_dir)) - Path(config_file_path).mkdir(parents=True, exist_ok=True) + print( + "No config file at {}! Snarf.\n{} was created for you.".format( + config_file_path, config_dir + ) + ) + Path(config_dir).mkdir(parents=True, exist_ok=True) return with open(config_file_path, "r") as config_file: app_config = json.load(config_file) @@ -76,15 +82,16 @@ def main(): try: last_check = app_config["lastupdate"] except KeyError: - last_check = ( - now - 21600 - ) # first run, no lastupdate, check up to 6 hours ago + last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(app_config["feeds"]): # Get the feed - feed = feedparser.parse(hook["url"]) - published_time = time.mktime(feed.entries[0]["published_parsed"]) - published_time = published_time + hook["offset"] print("Parsing feed {}...".format(hook["name"])) + feed = feedparser.parse(hook["url"]) + try: + published_time = time.mktime(feed.entries[0]["published_parsed"]) + published_time = published_time + hook["offset"] + except KeyError: + published_time = now - 10 # Not sure what a sensible default here is # Hash the title of the latest post and use that to determine if it's been posted new_hash = hashlib.sha3_512( bytes(feed.entries[0]["title"], "utf-8") @@ -122,7 +129,7 @@ def main(): "attachments": [], } custom_header = { - "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc1)", + "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc2)", "content-type": "application/json", } webhook_string = json.dumps(webhook) From 8229a14cfe944a19e0dc29335d7aa85e4f6fc57a Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 18:13:01 -0500 Subject: [PATCH 18/44] Cleaned up file/dir paths a bit, make app_config global app_config is global pending the next commit which will separate out all the config file handling from main() as the LSP was saying the main function had become too complex. --- discorss.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/discorss.py b/discorss.py index 4fe079b..83c71bd 100755 --- a/discorss.py +++ b/discorss.py @@ -17,6 +17,7 @@ from pathlib import Path import json import time import os +import sys import re config_dir = os.environ.get("XDG_CONFIG_HOME") @@ -26,13 +27,13 @@ if config_dir is None: config_dir = str(home_dir) + "/.config/discorss" else: config_file_path = config_dir + r"/discorss/discorss.conf" -log_file_path = r"/var/log/discorss" -# log_file_path = r"./log" -log_file_name = r"/app.log" +log_dir = r"/var/log/discorss" +log_file_path = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. html_filter = re.compile(r"\<\/?([A-Za-z \:\.\/\"\=])*\>") success_codes = ["200", "201", "202", "203", "204", "205", "206"] +app_config = {} # This function gets and formats the brief excerpt that goes in the embed From a1a6998e5299d80c20465c2ff8b1d174a9cce1d4 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 18:14:37 -0500 Subject: [PATCH 19/44] Split out config file handling into its own function --- discorss.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/discorss.py b/discorss.py index 83c71bd..f0ee41f 100755 --- a/discorss.py +++ b/discorss.py @@ -56,30 +56,43 @@ def get_description(feed): return desc -def main(): - os.environ["TZ"] = "America/Toronto" - time.tzset() +def setupPaths(): + global app_config # Check for log and config files/paths, create empty directories if needed # TODO: make this cleaner - try: - Path(log_file_path).mkdir(parents=True, exist_ok=True) - except FileExistsError: - print( - "The logfile path {} already exists and is not a directory!".format( - log_file_path - ) - ) + if not Path(log_file_path).exists(): + print("No log file path exists. Yark! We'll try and make {}...", log_dir) + try: + Path(log_dir).mkdir(parents=True, exist_ok=True) + except FileExistsError: + print("The path {} already exists and is not a directory!".format(log_dir)) if not Path(config_file_path).exists(): print( - "No config file at {}! Snarf.\n{} was created for you.".format( + "No config file at {}! Snarf. We'll try and make {}...".format( config_file_path, config_dir ) ) - Path(config_dir).mkdir(parents=True, exist_ok=True) + try: + Path(config_dir).mkdir(parents=True, exist_ok=True) + except FileExistsError: + print( + "The config dir {} already exists and is not a directory! Please fix manually.".format( + config_dir + ) + ) + sys.exit(255) return + # Loading the config file with open(config_file_path, "r") as config_file: app_config = json.load(config_file) + return + + +def main(): + os.environ["TZ"] = "America/Toronto" + time.tzset() now = time.mktime(time.localtime()) + setupPaths() # Handle the config and log paths try: last_check = app_config["lastupdate"] except KeyError: From 3def57a9334f03c9fb8877c3b11d7337c95cc12e Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 18:15:10 -0500 Subject: [PATCH 20/44] Quick and dirty hack to check for latest post in a feed This is needed in case the feed is not in reverse chronological order, like most feeds. This needs testing still. --- discorss.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/discorss.py b/discorss.py index f0ee41f..c36b49a 100755 --- a/discorss.py +++ b/discorss.py @@ -100,16 +100,25 @@ def main(): for i, hook in enumerate(app_config["feeds"]): # Get the feed print("Parsing feed {}...".format(hook["name"])) - feed = feedparser.parse(hook["url"]) - try: - published_time = time.mktime(feed.entries[0]["published_parsed"]) - published_time = published_time + hook["offset"] - except KeyError: - published_time = now - 10 # Not sure what a sensible default here is + feeds = feedparser.parse(hook["url"]) + latest_post = [] + prev_best = 0 + for feed in feeds: + try: + published_time = time.mktime(feed["published_parsed"]) + published_time = published_time + hook["offset"] + except KeyError: + published_time = feed["published"] + print(published_time) + sys.exit(254) + if published_time > prev_best: + latest_post = feed + prev_best = published_time + else: + continue + # Hash the title of the latest post and use that to determine if it's been posted - new_hash = hashlib.sha3_512( - bytes(feed.entries[0]["title"], "utf-8") - ).hexdigest() + new_hash = hashlib.sha3_512(bytes(latest_post["title"], "utf-8")).hexdigest() try: if hook["lasthash"] != new_hash: app_config["feeds"][i]["lasthash"] = new_hash @@ -121,8 +130,8 @@ def main(): webhook = { "embeds": [ { - "title": str(feed.entries[0]["title"]), - "url": str(feed.entries[0]["link"]), + "title": str(latest_post["title"]), + "url": str(latest_post["link"]), "color": 216128, "footer": { "name": "DiscoRSS", @@ -135,7 +144,7 @@ def main(): "fields": [ { "name": "Excerpt from post:", - "value": get_description(feed), + "value": get_description(latest_post), } ], } From a263f5cb931b7afbc38083aa5cf4598443ad3f74 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 20:56:37 -0500 Subject: [PATCH 21/44] FEATURE: Added logging, finally! Now the log_dir and log_file_path actually do something useful. --- discorss.py | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/discorss.py b/discorss.py index c36b49a..94d3dec 100755 --- a/discorss.py +++ b/discorss.py @@ -13,6 +13,7 @@ import requests import feedparser import hashlib +import logging from pathlib import Path import json import time @@ -58,10 +59,11 @@ def get_description(feed): def setupPaths(): global app_config + global logger # Check for log and config files/paths, create empty directories if needed # TODO: make this cleaner - if not Path(log_file_path).exists(): - print("No log file path exists. Yark! We'll try and make {}...", log_dir) + if not Path(log_dir).exists(): + print("No log file path exists. Yark! We'll try and make {}...".format(log_dir)) try: Path(log_dir).mkdir(parents=True, exist_ok=True) except FileExistsError: @@ -85,6 +87,15 @@ def setupPaths(): # Loading the config file with open(config_file_path, "r") as config_file: app_config = json.load(config_file) + # Set up logging + logger = logging.getLogger(__name__) + logging.basicConfig( + filename=str(log_dir + log_file_path), + encoding="utf-8", + level=logging.INFO, + datefmt="%m/%d/%Y %H:%M:%S", + format="%(asctime)s: %(levelname)s: %(message)s", + ) return @@ -99,24 +110,28 @@ def main(): last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(app_config["feeds"]): # Get the feed - print("Parsing feed {}...".format(hook["name"])) + logger.info("Parsing feed %s...", hook["name"]) feeds = feedparser.parse(hook["url"]) latest_post = [] prev_best = 0 - for feed in feeds: + for feed in feeds["entries"]: try: + bad_time = False published_time = time.mktime(feed["published_parsed"]) published_time = published_time + hook["offset"] except KeyError: - published_time = feed["published"] - print(published_time) - sys.exit(254) + published_time = time.mktime(feed["updated_parsed"]) + bad_time = True if published_time > prev_best: latest_post = feed prev_best = published_time else: continue - + if bad_time is True: + logger.warning( + "Feed %s doesn't supply a published time, using updated time instead", + hook["name"], + ) # Hash the title of the latest post and use that to determine if it's been posted new_hash = hashlib.sha3_512(bytes(latest_post["title"], "utf-8")).hexdigest() try: @@ -126,7 +141,16 @@ def main(): continue except KeyError: app_config["feeds"][i]["lasthash"] = new_hash + logger.info( + "Feed %s has no existing hash, likely a new feed!", hook["name"] + ) # Generate the webhook + logger.info( + "Publishing webhook for %s. Last check was %d, now is %d", + hook["name"], + last_check, + now, + ) webhook = { "embeds": [ { @@ -157,9 +181,10 @@ def main(): } webhook_string = json.dumps(webhook) - if published_time > last_check: - r = requests.post( - hook["webhook"], data=webhook_string, headers=custom_header + r = requests.post(hook["webhook"], data=webhook_string, headers=custom_header) + if r.status_code not in success_codes: + logger.error( + "Error %d while trying to post %s", r.status_code, hook["webhook"] ) if r.status_code not in success_codes: print( From 0a22cfe4ee8d3417dd682da1fd7d002e6afaa07c Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 20:57:40 -0500 Subject: [PATCH 22/44] A few small cleanup items --- discorss.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/discorss.py b/discorss.py index 94d3dec..357537c 100755 --- a/discorss.py +++ b/discorss.py @@ -43,13 +43,13 @@ app_config = {} # TODO: make the character limit smarter, as to split at a natural point def get_description(feed): try: - temporary_string = str(feed.entries[0]["summary_detail"]["value"]) + temporary_string = str(feed["summary_detail"]["value"]) temporary_string = html_filter.sub("", temporary_string) desc = ( temporary_string[:250] if len(temporary_string) > 250 else temporary_string ) except KeyError: - temporary_string = str(feed.entries[0]["description"]) + temporary_string = str(feed["description"]) temporary_string = html_filter.sub("", temporary_string) desc = ( temporary_string[:250] if len(temporary_string) > 250 else temporary_string @@ -186,15 +186,11 @@ def main(): logger.error( "Error %d while trying to post %s", r.status_code, hook["webhook"] ) - if r.status_code not in success_codes: - print( - "Error {} while trying to post {}".format( - r.status_code, hook["webhook"] - ) - ) - app_config["lastupdate"] = now - with open(config_file_path, "w") as config_file: - json.dump(app_config, config_file, indent=4) + + # End of feed loop + app_config["lastupdate"] = now + with open(config_file_path, "w") as config_file: + json.dump(app_config, config_file, indent=4) return From dd6553a6f195da45700b5d02a8bb86d8f6f30f30 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 25 Feb 2025 21:02:59 -0500 Subject: [PATCH 23/44] Updates to README to explain recent changes and logging --- README.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 03c17cd..c304a2c 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,13 @@ requests >= 2.4.2 feedparser ``` -The remaining imports should all be part of the standard Python install. +The remaining imports should all be part of the standard Python install. + +## Important Notes + +As it currently is written, the script uses the hash of the post title to prevent sending duplicates. However, a recent change to check for the publish time was added, only because some feeds are not in reverse chronological order (latest post at top of feed, ie, entry index 0). Because of this, we do actually need to check the publish times. This still needs some testing and things might be a bit broken because of it. If you see any issues please let me know. + +Logging was recently enabled. Make sure that the user running the script (especially when using systemd timers) has write access to the /var/log/discorss directory. The app will try and create the directory for you, but if your user doesn't have permissions to create directories in /var/log this will fail and this will probably crash the script as is. I will try and remember to catch that exception and exit gracefully with an error message to stdout. If you want the logs to go somewhere else, just edit the log_dir variable near the top of discorss.py. Choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't have an equivalent to the /var/log directory in the user directory, so I wasn't sure what the best default was. In the future, we may switch to logging using systemd and journald directly, though it is nice to have a separate file. ## How to setup @@ -40,7 +46,7 @@ To configure the script, create /etc/discorss.conf with the following structure: Create a webhook for each feed (unless you want them all to show as the same webhook for whatever reason) and make sure to add it in to the config. I have it set up with a webhook for each site, each with the site's icon and name set for the webhook which makes the messages look really nice. -The offset should only be required if feeds aren't showing up. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds. So most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update, I just need to sit down and wrangle with feedparser and datetime some more. All fields are mandatory, if you want to have no offset for example, set it to 0. The name and siteurl are used to create the "author" field in the Discord embed. +The offset should only be required if feeds aren't showing up. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds. So most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update, I just need to sit down and wrangle with feedparser and datetime some more. All fields are mandatory, if you want to have no offset for example, set it to 0. The name and siteurl are used to create the "author" field in the Discord embed. ## Automation From 87a376c5e8e4f3609ed381b7398102c235ec40c4 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 26 Feb 2025 17:41:33 -0500 Subject: [PATCH 24/44] Fixed path in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c304a2c..8fa07b9 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ Logging was recently enabled. Make sure that the user running the script (especi ## How to setup -To configure the script, create /etc/discorss.conf with the following structure: +To configure the script, create ~/.config/discorss/discorss.conf with the following structure: ``` { From 7d84c7c257a74b158451d4344bed09c03b0d5c80 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 26 Feb 2025 17:44:39 -0500 Subject: [PATCH 25/44] Bump version to v0.2rc3 --- discorss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index 357537c..67da27d 100755 --- a/discorss.py +++ b/discorss.py @@ -176,7 +176,7 @@ def main(): "attachments": [], } custom_header = { - "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc2)", + "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc3)", "content-type": "application/json", } webhook_string = json.dumps(webhook) From 1c78edd38e9cde6b8a615f3033c448430468e9e3 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 26 Feb 2025 20:12:29 -0500 Subject: [PATCH 26/44] Fixed status code checking --- discorss.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/discorss.py b/discorss.py index 67da27d..55b0b98 100755 --- a/discorss.py +++ b/discorss.py @@ -33,7 +33,7 @@ log_file_path = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. html_filter = re.compile(r"\<\/?([A-Za-z \:\.\/\"\=])*\>") -success_codes = ["200", "201", "202", "203", "204", "205", "206"] +success_codes = [200, 201, 202, 203, 204, 205, 206] app_config = {} @@ -108,8 +108,7 @@ def main(): last_check = app_config["lastupdate"] except KeyError: last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago - for i, hook in enumerate(app_config["feeds"]): - # Get the feed + for i, hook in enumerate(app_config["feeds"]): # Feed loop start logger.info("Parsing feed %s...", hook["name"]) feeds = feedparser.parse(hook["url"]) latest_post = [] From 8ff64608cdf612c9f0fe420a2dadcdde6b949e71 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Mon, 3 Mar 2025 07:41:41 -0500 Subject: [PATCH 27/44] Fixed html regex filter. Edited some logging types. See full msg. The HTML regex wasn't working because I was missing some really obvious capture groups. The regex filter is really only for the kernel.org Releases Feed, just to make it look a bit cleaner. We don't actually need the direct links because the post's title already links directly to the front page. We mostly just want to know there's been a release. Some logging parameters were changed to make the log less cluttered by default. Going to write a logrotate config that will be included in the README or something to help people with rotating the logs automatically. Also added a few small clarifying comments, as well as an IDEA! --- discorss.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/discorss.py b/discorss.py index 55b0b98..e7f27d5 100755 --- a/discorss.py +++ b/discorss.py @@ -32,10 +32,14 @@ log_dir = r"/var/log/discorss" log_file_path = r"/app.log" # Yes, I know you "can't parse HTML with regex", but # just watch me. -html_filter = re.compile(r"\<\/?([A-Za-z \:\.\/\"\=])*\>") +html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>") success_codes = [200, 201, 202, 203, 204, 205, 206] app_config = {} +# IDEA: Consider making this into a class-based program +# This would solve a couple issues around global variables and generally +# make things a bit neater + # This function gets and formats the brief excerpt that goes in the embed # Different feeds put summaries in different fields, so we pick the best @@ -109,7 +113,7 @@ def main(): except KeyError: last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(app_config["feeds"]): # Feed loop start - logger.info("Parsing feed %s...", hook["name"]) + logger.debug("Parsing feed %s...", hook["name"]) feeds = feedparser.parse(hook["url"]) latest_post = [] prev_best = 0 @@ -183,10 +187,14 @@ def main(): r = requests.post(hook["webhook"], data=webhook_string, headers=custom_header) if r.status_code not in success_codes: logger.error( - "Error %d while trying to post %s", r.status_code, hook["webhook"] + "Error %d while trying to post %s", r.status_code, hook["name"] ) + else: + logger.debug("Got %d when posting %s", r.status_code, hook["name"]) # End of feed loop + + # Dump updated config back to json file app_config["lastupdate"] = now with open(config_file_path, "w") as config_file: json.dump(app_config, config_file, indent=4) From 8129da759f1c74dee55e3ea3cdd1a6c3b83de844 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 4 Mar 2025 16:27:06 -0500 Subject: [PATCH 28/44] Trying to make description cutoff smarter. Changed hashing. Hashing now takes the sha3_512 hash of both the title and the published time, because some feeds (like weather alerts) will have the same title all the time, for every entry. The description cutoff now goes backwards until it finds a space character, then it sets the cutoff there. Also, the length can now be passed as a parameter, with default value of 250. Might also add minimum length as a parameter too. --- discorss.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/discorss.py b/discorss.py index e7f27d5..d33f387 100755 --- a/discorss.py +++ b/discorss.py @@ -45,18 +45,32 @@ app_config = {} # Different feeds put summaries in different fields, so we pick the best # one and limit it to 250 characters. # TODO: make the character limit smarter, as to split at a natural point -def get_description(feed): +def get_description(feed, length=250): try: temporary_string = str(feed["summary_detail"]["value"]) temporary_string = html_filter.sub("", temporary_string) + while length > 150: + if temporary_string[length - 1 : length] == " ": + break + else: + length -= 1 desc = ( - temporary_string[:250] if len(temporary_string) > 250 else temporary_string + temporary_string[:length] + if len(temporary_string) > length + else temporary_string ) except KeyError: temporary_string = str(feed["description"]) temporary_string = html_filter.sub("", temporary_string) + while length > 150: + if temporary_string[length - 1 : length] == " ": + break + else: + length -= 1 desc = ( - temporary_string[:250] if len(temporary_string) > 250 else temporary_string + temporary_string[:length] + if len(temporary_string) > length + else temporary_string ) return desc @@ -135,8 +149,10 @@ def main(): "Feed %s doesn't supply a published time, using updated time instead", hook["name"], ) - # Hash the title of the latest post and use that to determine if it's been posted - new_hash = hashlib.sha3_512(bytes(latest_post["title"], "utf-8")).hexdigest() + # Hash the title and time of the latest post and use that to determine if it's been posted + new_hash = hashlib.sha3_512( + bytes(latest_post["title"] + str(published_time), "utf-8") + ).hexdigest() try: if hook["lasthash"] != new_hash: app_config["feeds"][i]["lasthash"] = new_hash From 9a5c4616e38c004ef71d25fe5ecaf8d29568a72d Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 4 Mar 2025 16:36:01 -0500 Subject: [PATCH 29/44] Finished description cutoff detection. Added min_length parameter, as well as an addon parameter that might be used in the future to add extra text to the description where needed. Next up will be checking for media in the entry and adding a second embed field or attachment so the media can be previewed or listened to/watched right in the Discord post. --- discorss.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/discorss.py b/discorss.py index d33f387..b0163f0 100755 --- a/discorss.py +++ b/discorss.py @@ -44,34 +44,25 @@ app_config = {} # This function gets and formats the brief excerpt that goes in the embed # Different feeds put summaries in different fields, so we pick the best # one and limit it to 250 characters. -# TODO: make the character limit smarter, as to split at a natural point -def get_description(feed, length=250): +def get_description(feed, length=250, min_length=150, addons=None): try: temporary_string = str(feed["summary_detail"]["value"]) temporary_string = html_filter.sub("", temporary_string) - while length > 150: + while length > min_length: if temporary_string[length - 1 : length] == " ": break else: length -= 1 - desc = ( - temporary_string[:length] - if len(temporary_string) > length - else temporary_string - ) + desc = temporary_string[:length] except KeyError: temporary_string = str(feed["description"]) temporary_string = html_filter.sub("", temporary_string) - while length > 150: + while length > min_length: if temporary_string[length - 1 : length] == " ": break else: length -= 1 - desc = ( - temporary_string[:length] - if len(temporary_string) > length - else temporary_string - ) + desc = temporary_string[:length] return desc From 457e2c33150ee0566ad72b8797cc7f7bcfc114d4 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Thu, 6 Mar 2025 20:11:51 -0500 Subject: [PATCH 30/44] feat: rewrote get_description to allow extra text Now if you want to add something to the end of the description, you can pass it via the addons parameter. Also moved a line that was duplicated to reduce the function length. --- discorss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/discorss.py b/discorss.py index b0163f0..555c3e2 100755 --- a/discorss.py +++ b/discorss.py @@ -53,7 +53,6 @@ def get_description(feed, length=250, min_length=150, addons=None): break else: length -= 1 - desc = temporary_string[:length] except KeyError: temporary_string = str(feed["description"]) temporary_string = html_filter.sub("", temporary_string) @@ -62,7 +61,10 @@ def get_description(feed, length=250, min_length=150, addons=None): break else: length -= 1 - desc = temporary_string[:length] + + desc = temporary_string[:length] + if addons is not None: + desc = desc + str(addons) return desc From b72f1d72910e0d0c2310445a7f14bf379df4ef1d Mon Sep 17 00:00:00 2001 From: RoscoeDaWah Date: Fri, 14 Mar 2025 10:50:45 +0000 Subject: [PATCH 31/44] docs: add syntax highlighting to README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8fa07b9..06b28fe 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ Logging was recently enabled. Make sure that the user running the script (especi To configure the script, create ~/.config/discorss/discorss.conf with the following structure: -``` +```json { "feeds": [ { @@ -54,7 +54,7 @@ To automate feed posting, create a systemd service and timer to execute the scri Use the command `systemctl --user edit --full --force discorss.service` and then paste in something like this: -``` +```systemd [Unit] Description=Discord RSS feeder Wants=discorss.timer @@ -68,7 +68,7 @@ WantedBy=default.target ``` Make sure to edit the ExecStart to point to the correct location. Then we need a systemd timer to automatically fire the script. Run `systemctl --user edit --full --force discorss.timer` and then paste in this: -``` +```systemd [Unit] Description=Timer for DiscoRSS Requires=discorss.service From ce71ef1e81b5dc391c180a9155a39a35bb13b60a Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 16 Mar 2025 02:20:10 -0400 Subject: [PATCH 32/44] chore: Changed warning text, and some logging values --- discorss.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discorss.py b/discorss.py index 555c3e2..56d5d16 100755 --- a/discorss.py +++ b/discorss.py @@ -89,7 +89,7 @@ def setupPaths(): Path(config_dir).mkdir(parents=True, exist_ok=True) except FileExistsError: print( - "The config dir {} already exists and is not a directory! Please fix manually.".format( + "The config dir {} already exists and is not a directory! Please fix manually. Quitting!".format( config_dir ) ) @@ -138,11 +138,12 @@ def main(): else: continue if bad_time is True: - logger.warning( + logger.debug( "Feed %s doesn't supply a published time, using updated time instead", hook["name"], ) # Hash the title and time of the latest post and use that to determine if it's been posted + # Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3 new_hash = hashlib.sha3_512( bytes(latest_post["title"] + str(published_time), "utf-8") ).hexdigest() From 0de8e237a0351e7e999611e8b27de652033e2bf7 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Wed, 16 Apr 2025 16:55:50 -0400 Subject: [PATCH 33/44] feat: created installation helper script This script will create a basic systemd user service and timer, and then optionally activate them. It will also create the config directory and optionally put in an example config using the RSS feed from Phoronix (the Linux news site). This will likely need some tweaking in the future. --- install.sh | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 install.sh diff --git a/install.sh b/install.sh new file mode 100644 index 0000000..59b3ea1 --- /dev/null +++ b/install.sh @@ -0,0 +1,79 @@ +#!/bin/bash + +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +# This script will set up a basic systemd service and timer for DiscoRSS +# You can optionally edit the entries here before running it, or you can +# use systemctl --user edit --full discorss.service or discorss.timer +# after installing them. + +cat << EOF > discorss.service +[Unit] +Description=Discord RSS feeder +Wants=discorss.timer + +[Service] +Type=oneshot +ExecStart=/home/amr/workspace/python/discorss/discorss.py + +[Install] +WantedBy=default.target + +EOF + +cat << EOF > discorss.timer +[Unit] +Description=Timer for DiscoRSS +Requires=discorss.service + +[Timer] +Unit=discorss.service +OnCalendar=*:0/5:00 +AccuracySec=1s + +[Install] +WantedBy=timers.target + +EOF + +cp discorss.service ~/.config/systemd/user/ +cp discorss.timer ~/.config/systemd/user/ + +systemctl --user daemon-reload + +printf "Would you like a basic example config created for you? [y/n]" +read answer1 +if [ "$answer1" =~ ^[yYnN]$ ]; then + mkdir -p -v ~/.config/discorss + cat << EOF > ~/.config/discorss/discorss.conf +{ + "feeds": [ + { + "name": "Phoronix", + "siteurl": "https://www.phoronix.com/", + "url": "http://www.phoronix.com/rss.php", + "webhook": "PASTE WEBHOOK URL HERE", + "offset": 0, + } + ], +} +EOF + printf "Make sure to edit ~/.config/discorss/discorss.conf and add in your custom feeds and webhook URLS! The script will just error out if you don't do this." +else + printf "Make sure to create a config at ~/.config/discorss/discorss.conf and follow the pattern shown in the README." +fi + +printf "Would you like to have the timer enabled and started now? [y/n]" +read answer +if [ "$answer" =~ ^[yYnN]$ ]; then + systemctl --user enable --now discorss.timer + printf "discorss.timer enabled and started. Don't enable or start discorss.service -- the timer does this automatically." +else + printf "Don't forget to run systemctl --user enable --now discorss.timer when you are ready! Don't enable or start discorss.service -- the timer does this automatically." +fi + +printf "You should be almost ready to go! Double-check your config files, and check systemctl --user list-timers once the discorss.timer is enabled to see when it will fire next. The default is every 5 minutes." + +printf "Remember, if you need help or encounter any bugs, contact me via the issues tracker on the git repository where you got this from!" From f70b18040a07fa69b3487840f31eecb465610512 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sat, 19 Apr 2025 09:04:42 -0400 Subject: [PATCH 34/44] fix: wrapped hash in try/except to detect empty feeds Also changed file mode of install.sh to +x --- discorss.py | 18 ++++++++++++------ install.sh | 0 2 files changed, 12 insertions(+), 6 deletions(-) mode change 100644 => 100755 install.sh diff --git a/discorss.py b/discorss.py index 56d5d16..403b864 100755 --- a/discorss.py +++ b/discorss.py @@ -19,6 +19,7 @@ import json import time import os import sys +import argparse import re config_dir = os.environ.get("XDG_CONFIG_HOME") @@ -144,9 +145,13 @@ def main(): ) # Hash the title and time of the latest post and use that to determine if it's been posted # Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3 - new_hash = hashlib.sha3_512( - bytes(latest_post["title"] + str(published_time), "utf-8") - ).hexdigest() + try: + new_hash = hashlib.sha3_512( + bytes(latest_post["title"] + str(published_time), "utf-8") + ).hexdigest() + except TypeError: + logger.error("Title of %s isn't hashing correctly", hook["name"]) + continue try: if hook["lasthash"] != new_hash: app_config["feeds"][i]["lasthash"] = new_hash @@ -169,10 +174,10 @@ def main(): { "title": str(latest_post["title"]), "url": str(latest_post["link"]), - "color": 216128, + "color": 2123412, "footer": { - "name": "DiscoRSS", - # "url": "https://git.frzn.dev/amr/discorss", + "text": "DiscoRSS", + "icon_url": "https://frzn.dev/~amr/images/discorss.png", }, "author": { "name": str(hook["name"]), @@ -184,6 +189,7 @@ def main(): "value": get_description(latest_post), } ], + # "timestamp": str(now), } ], "attachments": [], diff --git a/install.sh b/install.sh old mode 100644 new mode 100755 From af51c317e28ee46d3707f6ad86a959dbe29f14cd Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sat, 19 Apr 2025 12:45:07 -0400 Subject: [PATCH 35/44] bug: trying to track down lock-up bug For some reason the script seems to be occasionally locking up, and then because the systemd service state is stuck in "starting" it never finishes which means the timer never gets reset. Adding some debug statements to try and figure out the cause. Also changed logging to DEBUG level. I'd much rather fix the bug but a timeout would also solve the issue. --- discorss.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index 403b864..ab49694 100755 --- a/discorss.py +++ b/discorss.py @@ -104,7 +104,7 @@ def setupPaths(): logging.basicConfig( filename=str(log_dir + log_file_path), encoding="utf-8", - level=logging.INFO, + level=logging.DEBUG, datefmt="%m/%d/%Y %H:%M:%S", format="%(asctime)s: %(levelname)s: %(message)s", ) @@ -200,6 +200,7 @@ def main(): } webhook_string = json.dumps(webhook) + logger.debug("About to run POST for %s", hook["name"]) r = requests.post(hook["webhook"], data=webhook_string, headers=custom_header) if r.status_code not in success_codes: logger.error( @@ -211,6 +212,7 @@ def main(): # End of feed loop # Dump updated config back to json file + logger.debug("Dumping config back to %s", str(config_file_path)) app_config["lastupdate"] = now with open(config_file_path, "w") as config_file: json.dump(app_config, config_file, indent=4) From a62a0fcdc310747172895766e10ea19eca673700 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 20 Apr 2025 13:30:43 -0400 Subject: [PATCH 36/44] debug: added more logging to narrow down issue --- discorss.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/discorss.py b/discorss.py index ab49694..76f243d 100755 --- a/discorss.py +++ b/discorss.py @@ -125,6 +125,7 @@ def main(): feeds = feedparser.parse(hook["url"]) latest_post = [] prev_best = 0 + logger.debug("About to sort through entries for feed %s ...", hook["name"]) for feed in feeds["entries"]: try: bad_time = False @@ -145,6 +146,7 @@ def main(): ) # Hash the title and time of the latest post and use that to determine if it's been posted # Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3 + logger.debug("About to hash %s ...", latest_post["title"]) try: new_hash = hashlib.sha3_512( bytes(latest_post["title"] + str(published_time), "utf-8") From 087a6339c8adf4df3c23d298fe1cbddcbdf23eef Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 20 Apr 2025 16:06:16 -0400 Subject: [PATCH 37/44] class refactor: Squashed commit of the following: commit 597f3837244cce6f501cf16f96e2cac3a81c8ab2 Author: A.M. Rowsell Date: Sun Apr 20 16:04:00 2025 -0400 fix: typo in log setup, error when replacing w/self commit 810cbd6f3d32ca0c7f74ec90f17c0011661d4785 Author: A.M. Rowsell Date: Sun Apr 20 13:51:41 2025 -0400 refactor: transformed into class-based app --- discorss.py | 392 +++++++++++++++++++++++++++------------------------- 1 file changed, 204 insertions(+), 188 deletions(-) diff --git a/discorss.py b/discorss.py index 76f243d..d90c7fb 100755 --- a/discorss.py +++ b/discorss.py @@ -22,204 +22,220 @@ import sys import argparse import re -config_dir = os.environ.get("XDG_CONFIG_HOME") -home_dir = Path.home() -if config_dir is None: - config_file_path = str(home_dir) + "/.config/discorss/discorss.conf" - config_dir = str(home_dir) + "/.config/discorss" -else: - config_file_path = config_dir + r"/discorss/discorss.conf" -log_dir = r"/var/log/discorss" -log_file_path = r"/app.log" -# Yes, I know you "can't parse HTML with regex", but -# just watch me. -html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>") -success_codes = [200, 201, 202, 203, 204, 205, 206] -app_config = {} -# IDEA: Consider making this into a class-based program -# This would solve a couple issues around global variables and generally -# make things a bit neater +class Discorss: + def __init__(self): + self.config_dir = os.environ.get("XDG_CONFIG_HOME") + home_dir = Path.home() + if self.config_dir is None: + self.config_file_path = str(home_dir) + "/.config/discorss/discorss.conf" + self.config_dir = str(home_dir) + "/.config/discorss" + else: + self.config_file_path = self.config_dir + r"/discorss/discorss.conf" + self.log_dir = r"/var/log/discorss" + self.log_file_path = r"/app.log" + # Yes, I know you "can't parse HTML with regex", but + # just watch me. + self.html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>") + self.success_codes = [200, 201, 202, 203, 204, 205, 206] + self.app_config = {} - -# This function gets and formats the brief excerpt that goes in the embed -# Different feeds put summaries in different fields, so we pick the best -# one and limit it to 250 characters. -def get_description(feed, length=250, min_length=150, addons=None): - try: - temporary_string = str(feed["summary_detail"]["value"]) - temporary_string = html_filter.sub("", temporary_string) - while length > min_length: - if temporary_string[length - 1 : length] == " ": - break - else: - length -= 1 - except KeyError: - temporary_string = str(feed["description"]) - temporary_string = html_filter.sub("", temporary_string) - while length > min_length: - if temporary_string[length - 1 : length] == " ": - break - else: - length -= 1 - - desc = temporary_string[:length] - if addons is not None: - desc = desc + str(addons) - return desc - - -def setupPaths(): - global app_config - global logger - # Check for log and config files/paths, create empty directories if needed - # TODO: make this cleaner - if not Path(log_dir).exists(): - print("No log file path exists. Yark! We'll try and make {}...".format(log_dir)) + # This function gets and formats the brief excerpt that goes in the embed + # Different feeds put summaries in different fields, so we pick the best + # one and limit it to 250 characters. + def get_description(self, feed, length=250, min_length=150, addons=None): try: - Path(log_dir).mkdir(parents=True, exist_ok=True) - except FileExistsError: - print("The path {} already exists and is not a directory!".format(log_dir)) - if not Path(config_file_path).exists(): - print( - "No config file at {}! Snarf. We'll try and make {}...".format( - config_file_path, config_dir - ) - ) - try: - Path(config_dir).mkdir(parents=True, exist_ok=True) - except FileExistsError: + temporary_string = str(feed["summary_detail"]["value"]) + temporary_string = self.html_filter.sub("", temporary_string) + while length > min_length: + if temporary_string[length - 1 : length] == " ": + break + else: + length -= 1 + except KeyError: + temporary_string = str(feed["description"]) + temporary_string = self.html_filter.sub("", temporary_string) + while length > min_length: + if temporary_string[length - 1 : length] == " ": + break + else: + length -= 1 + + desc = temporary_string[:length] + if addons is not None: + desc = desc + str(addons) + return desc + + def setupPaths(self): + # Check for log and config files/paths, create empty directories if needed + # TODO: make this cleaner + if not Path(self.log_dir).exists(): print( - "The config dir {} already exists and is not a directory! Please fix manually. Quitting!".format( - config_dir + "No log file path exists. Yark! We'll try and make {}...".format( + self.log_dir ) ) - sys.exit(255) + try: + Path(self.log_dir).mkdir(parents=True, exist_ok=True) + except FileExistsError: + print( + "The path {} already exists and is not a directory!".format( + self.log_dir + ) + ) + if not Path(self.config_file_path).exists(): + print( + "No config file at {}! Snarf. We'll try and make {}...".format( + self.config_file_path, self.config_dir + ) + ) + try: + Path(self.config_dir).mkdir(parents=True, exist_ok=True) + except FileExistsError: + print( + "The config dir {} already exists and is not a directory! Please fix manually. Quitting!".format( + self.config_dir + ) + ) + sys.exit(255) + return + # Loading the config file + with open(self.config_file_path, "r") as config_file: + self.app_config = json.load(config_file) + # Set up logging + self.logger = logging.getLogger(__name__) + logging.basicConfig( + filename=str(self.log_dir + self.log_file_path), + encoding="utf-8", + level=logging.DEBUG, + datefmt="%m/%d/%Y %H:%M:%S", + format="%(asctime)s: %(levelname)s: %(message)s", + ) return - # Loading the config file - with open(config_file_path, "r") as config_file: - app_config = json.load(config_file) - # Set up logging - logger = logging.getLogger(__name__) - logging.basicConfig( - filename=str(log_dir + log_file_path), - encoding="utf-8", - level=logging.DEBUG, - datefmt="%m/%d/%Y %H:%M:%S", - format="%(asctime)s: %(levelname)s: %(message)s", - ) - return + + def process(self): + os.environ["TZ"] = "America/Toronto" + time.tzset() + now = time.mktime(time.localtime()) + self.setupPaths() # Handle the config and log paths + try: + last_check = self.app_config["lastupdate"] + except KeyError: + last_check = ( + now - 21600 + ) # first run, no lastupdate, check up to 6 hours ago + for i, hook in enumerate(self.app_config["feeds"]): # Feed loop start + self.logger.debug("Parsing feed %s...", hook["name"]) + self.feeds = feedparser.parse(hook["url"]) + self.latest_post = [] + prev_best = 0 + self.logger.debug( + "About to sort through entries for feed %s ...", hook["name"] + ) + for feed in self.feeds["entries"]: + try: + bad_time = False + published_time = time.mktime(feed["published_parsed"]) + published_time = published_time + hook["offset"] + except KeyError: + published_time = time.mktime(feed["updated_parsed"]) + bad_time = True + if published_time > prev_best: + latest_post = feed + prev_best = published_time + else: + continue + if bad_time is True: + self.logger.debug( + "Feed %s doesn't supply a published time, using updated time instead", + hook["name"], + ) + # Hash the title and time of the latest post and use that to determine if it's been posted + # Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3 + self.logger.debug("About to hash %s ...", latest_post["title"]) + try: + new_hash = hashlib.sha3_512( + bytes(latest_post["title"] + str(published_time), "utf-8") + ).hexdigest() + except TypeError: + self.logger.error("Title of %s isn't hashing correctly", hook["name"]) + continue + try: + if hook["lasthash"] != new_hash: + self.app_config["feeds"][i]["lasthash"] = new_hash + else: + continue + except KeyError: + self.app_config["feeds"][i]["lasthash"] = new_hash + self.logger.info( + "Feed %s has no existing hash, likely a new feed!", hook["name"] + ) + # Generate the webhook + self.logger.info( + "Publishing webhook for %s. Last check was %d, now is %d", + hook["name"], + last_check, + now, + ) + webhook = { + "embeds": [ + { + "title": str(latest_post["title"]), + "url": str(latest_post["link"]), + "color": 2123412, + "footer": { + "text": "DiscoRSS", + "icon_url": "https://frzn.dev/~amr/images/discorss.png", + }, + "author": { + "name": str(hook["name"]), + "url": str(hook["siteurl"]), + }, + "fields": [ + { + "name": "Excerpt from post:", + "value": self.get_description(latest_post), + } + ], + # "timestamp": str(now), + } + ], + "attachments": [], + } + custom_header = { + "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc3)", + "content-type": "application/json", + } + webhook_string = json.dumps(webhook) + + self.logger.debug("About to run POST for %s", hook["name"]) + r = requests.post( + hook["webhook"], data=webhook_string, headers=custom_header + ) + if r.status_code not in self.success_codes: + self.logger.error( + "Error %d while trying to post %s", r.status_code, hook["name"] + ) + else: + self.logger.debug("Got %d when posting %s", r.status_code, hook["name"]) + + # End of feed loop + + # Dump updated config back to json file + self.logger.debug("Dumping config back to %s", str(self.config_file_path)) + self.app_config["lastupdate"] = now + with open(self.config_file_path, "w") as config_file: + json.dump(self.app_config, config_file, indent=4) + + return + + +# end of Discorss class def main(): - os.environ["TZ"] = "America/Toronto" - time.tzset() - now = time.mktime(time.localtime()) - setupPaths() # Handle the config and log paths - try: - last_check = app_config["lastupdate"] - except KeyError: - last_check = now - 21600 # first run, no lastupdate, check up to 6 hours ago - for i, hook in enumerate(app_config["feeds"]): # Feed loop start - logger.debug("Parsing feed %s...", hook["name"]) - feeds = feedparser.parse(hook["url"]) - latest_post = [] - prev_best = 0 - logger.debug("About to sort through entries for feed %s ...", hook["name"]) - for feed in feeds["entries"]: - try: - bad_time = False - published_time = time.mktime(feed["published_parsed"]) - published_time = published_time + hook["offset"] - except KeyError: - published_time = time.mktime(feed["updated_parsed"]) - bad_time = True - if published_time > prev_best: - latest_post = feed - prev_best = published_time - else: - continue - if bad_time is True: - logger.debug( - "Feed %s doesn't supply a published time, using updated time instead", - hook["name"], - ) - # Hash the title and time of the latest post and use that to determine if it's been posted - # Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3 - logger.debug("About to hash %s ...", latest_post["title"]) - try: - new_hash = hashlib.sha3_512( - bytes(latest_post["title"] + str(published_time), "utf-8") - ).hexdigest() - except TypeError: - logger.error("Title of %s isn't hashing correctly", hook["name"]) - continue - try: - if hook["lasthash"] != new_hash: - app_config["feeds"][i]["lasthash"] = new_hash - else: - continue - except KeyError: - app_config["feeds"][i]["lasthash"] = new_hash - logger.info( - "Feed %s has no existing hash, likely a new feed!", hook["name"] - ) - # Generate the webhook - logger.info( - "Publishing webhook for %s. Last check was %d, now is %d", - hook["name"], - last_check, - now, - ) - webhook = { - "embeds": [ - { - "title": str(latest_post["title"]), - "url": str(latest_post["link"]), - "color": 2123412, - "footer": { - "text": "DiscoRSS", - "icon_url": "https://frzn.dev/~amr/images/discorss.png", - }, - "author": { - "name": str(hook["name"]), - "url": str(hook["siteurl"]), - }, - "fields": [ - { - "name": "Excerpt from post:", - "value": get_description(latest_post), - } - ], - # "timestamp": str(now), - } - ], - "attachments": [], - } - custom_header = { - "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc3)", - "content-type": "application/json", - } - webhook_string = json.dumps(webhook) - - logger.debug("About to run POST for %s", hook["name"]) - r = requests.post(hook["webhook"], data=webhook_string, headers=custom_header) - if r.status_code not in success_codes: - logger.error( - "Error %d while trying to post %s", r.status_code, hook["name"] - ) - else: - logger.debug("Got %d when posting %s", r.status_code, hook["name"]) - - # End of feed loop - - # Dump updated config back to json file - logger.debug("Dumping config back to %s", str(config_file_path)) - app_config["lastupdate"] = now - with open(config_file_path, "w") as config_file: - json.dump(app_config, config_file, indent=4) - - return + app = Discorss() + app.process() if __name__ == "__main__": From 2b4e4216f4c285fdb3cf467e6238339f78846e10 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Sun, 20 Apr 2025 16:10:08 -0400 Subject: [PATCH 38/44] release: bump version number to 0.2 --- discorss.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/discorss.py b/discorss.py index d90c7fb..99b4398 100755 --- a/discorss.py +++ b/discorss.py @@ -203,7 +203,7 @@ class Discorss: "attachments": [], } custom_header = { - "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2rc3)", + "user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2)", "content-type": "application/json", } webhook_string = json.dumps(webhook) From 2e18ede6a84c18a6b895d6314fba16e7c5af8b26 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Mon, 21 Apr 2025 19:29:56 -0400 Subject: [PATCH 39/44] docs/fix: Updated README.md and install.sh --- README.md | 20 +++++++++++++++----- install.sh | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 06b28fe..48148d4 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,15 @@ The remaining imports should all be part of the standard Python install. ## Important Notes -As it currently is written, the script uses the hash of the post title to prevent sending duplicates. However, a recent change to check for the publish time was added, only because some feeds are not in reverse chronological order (latest post at top of feed, ie, entry index 0). Because of this, we do actually need to check the publish times. This still needs some testing and things might be a bit broken because of it. If you see any issues please let me know. - -Logging was recently enabled. Make sure that the user running the script (especially when using systemd timers) has write access to the /var/log/discorss directory. The app will try and create the directory for you, but if your user doesn't have permissions to create directories in /var/log this will fail and this will probably crash the script as is. I will try and remember to catch that exception and exit gracefully with an error message to stdout. If you want the logs to go somewhere else, just edit the log_dir variable near the top of discorss.py. Choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't have an equivalent to the /var/log directory in the user directory, so I wasn't sure what the best default was. In the future, we may switch to logging using systemd and journald directly, though it is nice to have a separate file. +The logger will try and put the logs in `/var/log/discorss`. Make sure to create this directory and give the user running the script write permissions there. If you want the logs to go somewhere else, just edit the log_dir variable near the top of discorss.py. Choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't have an equivalent to the /var/log directory in the user directory, so I wasn't sure what the best default was. In the future, we may switch to logging using systemd and journald directly, though it is nice to have a separate file. ## How to setup -To configure the script, create ~/.config/discorss/discorss.conf with the following structure: +Note: see the Automation section below for info about using the `install.sh` script to help get all the files in the right places. + +### Config file format + +To configure the script, create `~/.config/discorss/discorss.conf` using JSON formatting like this: ```json { @@ -50,6 +52,13 @@ The offset should only be required if feeds aren't showing up. This is because f ## Automation +**New**: There is now `install.sh` in the repo which will automatically help you set up both the config file and the systemd unit files for the service and timer, using essentially the exact text below. It will copy them to the user systemd unit folder, `~/.config/systemd/user` and optionally enable the timer. It's a good idea to edit the configuration file at `~/.config/discorss/discorss.conf` and paste in your webhook URLs and add any other feeds you want before starting the timer, unless you can do it really quickly before the next 5 minute spot on the clock :) +Of course, if it fires with an invalid config, the script will just crash, and you'll probably just have to manually start the timer once the config is fixed, so not a big deal. + +_Remember to create `/var/log/discorss` and change it to be writeable by the user running the service!_ + +### Manual method + To automate feed posting, create a systemd service and timer to execute the script. Use the command `systemctl --user edit --full --force discorss.service` and then paste in something like this: @@ -61,13 +70,14 @@ Wants=discorss.timer [Service] Type=oneshot +TimeoutStartSec=120 ExecStart=/path/to/discorss.py [Install] WantedBy=default.target ``` -Make sure to edit the ExecStart to point to the correct location. Then we need a systemd timer to automatically fire the script. Run `systemctl --user edit --full --force discorss.timer` and then paste in this: +The TimeoutStartSec will catch any issues with the script locking up due to, e.g., DNS failures or RSS feeds being slow/unavailable. 2 minutes should be more than enough time unless you are running hundreds of feeds. Also make sure to edit the ExecStart to point to the correct location. Then we need a systemd timer to automatically fire the script. Run `systemctl --user edit --full --force discorss.timer` and then paste in this: ```systemd [Unit] Description=Timer for DiscoRSS diff --git a/install.sh b/install.sh index 59b3ea1..f69c1a2 100755 --- a/install.sh +++ b/install.sh @@ -16,6 +16,7 @@ Wants=discorss.timer [Service] Type=oneshot +TimeoutStartSec=120 ExecStart=/home/amr/workspace/python/discorss/discorss.py [Install] From e4539b5733c8bdc499398b4388b544587c989912 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 22 Apr 2025 02:11:01 -0400 Subject: [PATCH 40/44] chore: rename function, move some init code Also switching logging back to ERROR from DEBUG. The solution to the lockups for now is to just use systemd timer timeouts. --- discorss.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/discorss.py b/discorss.py index 99b4398..9d001ce 100755 --- a/discorss.py +++ b/discorss.py @@ -66,7 +66,10 @@ class Discorss: desc = desc + str(addons) return desc - def setupPaths(self): + def setup(self): + os.environ["TZ"] = "America/Toronto" + time.tzset() + self.now = time.mktime(time.localtime()) # Check for log and config files/paths, create empty directories if needed # TODO: make this cleaner if not Path(self.log_dir).exists(): @@ -107,22 +110,19 @@ class Discorss: logging.basicConfig( filename=str(self.log_dir + self.log_file_path), encoding="utf-8", - level=logging.DEBUG, + level=logging.ERROR, datefmt="%m/%d/%Y %H:%M:%S", format="%(asctime)s: %(levelname)s: %(message)s", ) return def process(self): - os.environ["TZ"] = "America/Toronto" - time.tzset() - now = time.mktime(time.localtime()) - self.setupPaths() # Handle the config and log paths + self.setup() # Handle the config and log paths try: last_check = self.app_config["lastupdate"] except KeyError: last_check = ( - now - 21600 + self.now - 21600 ) # first run, no lastupdate, check up to 6 hours ago for i, hook in enumerate(self.app_config["feeds"]): # Feed loop start self.logger.debug("Parsing feed %s...", hook["name"]) @@ -172,10 +172,10 @@ class Discorss: ) # Generate the webhook self.logger.info( - "Publishing webhook for %s. Last check was %d, now is %d", + "Publishing webhook for %s. Last check was %d, self.now is %d", hook["name"], last_check, - now, + self.now, ) webhook = { "embeds": [ @@ -197,7 +197,7 @@ class Discorss: "value": self.get_description(latest_post), } ], - # "timestamp": str(now), + # "timestamp": str(self.now), } ], "attachments": [], @@ -223,7 +223,7 @@ class Discorss: # Dump updated config back to json file self.logger.debug("Dumping config back to %s", str(self.config_file_path)) - self.app_config["lastupdate"] = now + self.app_config["lastupdate"] = self.now with open(self.config_file_path, "w") as config_file: json.dump(self.app_config, config_file, indent=4) From b0f08c405bc78d77028bc5f3c9a5240f513adeba Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 22 Apr 2025 03:17:16 -0400 Subject: [PATCH 41/44] fix: install.sh works properly now (with colour!) --- install.sh | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/install.sh b/install.sh index f69c1a2..3c32200 100755 --- a/install.sh +++ b/install.sh @@ -9,6 +9,8 @@ # use systemctl --user edit --full discorss.service or discorss.timer # after installing them. +printf "\e[1;34mDisco\e[1;38;5;208mRSS\e[0m Install Helper Script\n\n" + cat << EOF > discorss.service [Unit] Description=Discord RSS feeder @@ -39,14 +41,16 @@ WantedBy=timers.target EOF +mkdir -p ~/.config/systemd/user/ + cp discorss.service ~/.config/systemd/user/ cp discorss.timer ~/.config/systemd/user/ systemctl --user daemon-reload -printf "Would you like a basic example config created for you? [y/n]" +printf "Would you like a basic example config created for you? [y/n]: " read answer1 -if [ "$answer1" =~ ^[yYnN]$ ]; then +if [[ "$answer1" =~ ^([yY])$ ]]; then mkdir -p -v ~/.config/discorss cat << EOF > ~/.config/discorss/discorss.conf { @@ -61,20 +65,20 @@ if [ "$answer1" =~ ^[yYnN]$ ]; then ], } EOF - printf "Make sure to edit ~/.config/discorss/discorss.conf and add in your custom feeds and webhook URLS! The script will just error out if you don't do this." + printf "\nMake sure to edit \e[1;34m~/.config/discorss/discorss.conf\e[0m and add in your custom feeds and webhook URLS! The script will just error out if you don't do this." else - printf "Make sure to create a config at ~/.config/discorss/discorss.conf and follow the pattern shown in the README." + printf "\nMake sure to create a config at \e[1;34m~/.config/discorss/discorss.conf\e[0m and follow the pattern shown in the README." fi -printf "Would you like to have the timer enabled and started now? [y/n]" +printf "\nWould you like to have the timer enabled and started now? [y/n]: " read answer -if [ "$answer" =~ ^[yYnN]$ ]; then +if [[ "$answer" =~ ^([yY])$ ]]; then systemctl --user enable --now discorss.timer - printf "discorss.timer enabled and started. Don't enable or start discorss.service -- the timer does this automatically." + printf "\ndiscorss.timer enabled and started. \e[1;31mDon't enable or start discorss.service\e[0m -- the timer does this automatically." else - printf "Don't forget to run systemctl --user enable --now discorss.timer when you are ready! Don't enable or start discorss.service -- the timer does this automatically." + printf "\nDon't forget to run \e[1;32msystemctl --user enable --now discorss.timer\e[0m when you are ready! \e[1;31mDon't enable or start discorss.service\e[0m -- the timer does this automatically." fi -printf "You should be almost ready to go! Double-check your config files, and check systemctl --user list-timers once the discorss.timer is enabled to see when it will fire next. The default is every 5 minutes." +printf "\n\nYou should be almost ready to go! Double-check your config files, and check \e[1;32msystemctl --user list-timers\e[0m once the discorss.timer is enabled to see when it will fire next. The default is every 5 minutes." -printf "Remember, if you need help or encounter any bugs, contact me via the issues tracker on the git repository where you got this from!" +printf "\nRemember, if you need help or encounter any bugs, contact me via the issues tracker on the git repository where you got this from!" From 9d2530ab02803d27be53dc55111f78fbaed0f192 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 22 Apr 2025 04:26:49 -0400 Subject: [PATCH 42/44] fix: corrected errors in install.sh Also improved the script to actually use the script location in the discorss.service file... yeah I should have done that from the start, d'oh! --- install.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/install.sh b/install.sh index 3c32200..d3ed185 100755 --- a/install.sh +++ b/install.sh @@ -11,6 +11,8 @@ printf "\e[1;34mDisco\e[1;38;5;208mRSS\e[0m Install Helper Script\n\n" +workingDir=$(pwd) + cat << EOF > discorss.service [Unit] Description=Discord RSS feeder @@ -19,7 +21,7 @@ Wants=discorss.timer [Service] Type=oneshot TimeoutStartSec=120 -ExecStart=/home/amr/workspace/python/discorss/discorss.py +ExecStart=$workingDir/discorss.py [Install] WantedBy=default.target @@ -45,7 +47,8 @@ mkdir -p ~/.config/systemd/user/ cp discorss.service ~/.config/systemd/user/ cp discorss.timer ~/.config/systemd/user/ - +rm -f discorss.service +rm -f discorss.timer systemctl --user daemon-reload printf "Would you like a basic example config created for you? [y/n]: " @@ -60,9 +63,9 @@ if [[ "$answer1" =~ ^([yY])$ ]]; then "siteurl": "https://www.phoronix.com/", "url": "http://www.phoronix.com/rss.php", "webhook": "PASTE WEBHOOK URL HERE", - "offset": 0, + "offset": 0 } - ], + ] } EOF printf "\nMake sure to edit \e[1;34m~/.config/discorss/discorss.conf\e[0m and add in your custom feeds and webhook URLS! The script will just error out if you don't do this." From b243bc7bb4991f85eab236fd7e9c6486bba39051 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Tue, 22 Apr 2025 17:29:45 -0400 Subject: [PATCH 43/44] dev: Added more prompts to install script --- install.sh | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/install.sh b/install.sh index d3ed185..7a89398 100755 --- a/install.sh +++ b/install.sh @@ -13,7 +13,12 @@ printf "\e[1;34mDisco\e[1;38;5;208mRSS\e[0m Install Helper Script\n\n" workingDir=$(pwd) -cat << EOF > discorss.service +printf "Would you like the systemd service and timer files created for you? [y/n]: " +read answer +if [[ "$answer" =~ ^([yY])$ ]]; then + + cat << EOF > discorss.service +# Autogenerated by install.sh [Unit] Description=Discord RSS feeder Wants=discorss.timer @@ -28,7 +33,8 @@ WantedBy=default.target EOF -cat << EOF > discorss.timer + cat << EOF > discorss.timer +# Autogenerated by install.sh [Unit] Description=Timer for DiscoRSS Requires=discorss.service @@ -43,13 +49,18 @@ WantedBy=timers.target EOF -mkdir -p ~/.config/systemd/user/ - -cp discorss.service ~/.config/systemd/user/ -cp discorss.timer ~/.config/systemd/user/ -rm -f discorss.service -rm -f discorss.timer -systemctl --user daemon-reload + printf "Making ~/.config/systemd/user in case it doesn't exist ...\n" + mkdir -p -v ~/.config/systemd/user/ + printf "Copying service and timer files there ... \n" + cp discorss.service ~/.config/systemd/user/ + cp discorss.timer ~/.config/systemd/user/ + rm -f discorss.service + rm -f discorss.timer + printf "Reloading systemd daemon ... \n\n" + systemctl --user daemon-reload +else + printf "This script is intended to be automatically run. It's designed with systemd in mind, but you are free to use any automation tools. You can look at this script for examples of how to structure systemd user services and timers.\nOf course, you could always run it by hand, if you really want to :)\n\n" +fi printf "Would you like a basic example config created for you? [y/n]: " read answer1 @@ -84,4 +95,4 @@ fi printf "\n\nYou should be almost ready to go! Double-check your config files, and check \e[1;32msystemctl --user list-timers\e[0m once the discorss.timer is enabled to see when it will fire next. The default is every 5 minutes." -printf "\nRemember, if you need help or encounter any bugs, contact me via the issues tracker on the git repository where you got this from!" +printf "\nRemember, if you need help or encounter any bugs, contact me via the issues tracker on the git repository where you got this from!\n" From 1787d4da993beb0cfe5872360833637d1d224652 Mon Sep 17 00:00:00 2001 From: "A.M. Rowsell" Date: Fri, 25 Apr 2025 23:59:24 -0400 Subject: [PATCH 44/44] fix: bail install if on a non-systemd machine Instead suggest using cron. At some point I can probably write up some simple instructions for a basic cron setup. --- install.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/install.sh b/install.sh index 7a89398..82a5fab 100755 --- a/install.sh +++ b/install.sh @@ -13,6 +13,14 @@ printf "\e[1;34mDisco\e[1;38;5;208mRSS\e[0m Install Helper Script\n\n" workingDir=$(pwd) +# bail if we're on a non-systemd system, suggest cron +if [[ -d /run/systemd/system ]]; then + printf "systemd detected..." +else + printf "This script and DiscoRSS in general are optimized for systemd! You can use cron as a substitute but I haven't written any documentation for it, so you're on your own for now!" + exit 127 # command not found exit code +fi + printf "Would you like the systemd service and timer files created for you? [y/n]: " read answer if [[ "$answer" =~ ^([yY])$ ]]; then