Compare commits

...

3 commits

Author  SHA1        Message                                                            Date
        dd6553a6f1  Updates to README to explain recent changes and logging           2025-02-25 21:02:59 -05:00
        0a22cfe4ee  A few small cleanup items                                          2025-02-25 20:57:40 -05:00
        a263f5cb93  FEATURE: Added logging, finally!                                   2025-02-25 20:56:37 -05:00
                    Now the log_dir and log_file_path actually do something useful.
2 changed files with 51 additions and 24 deletions

README.md

@@ -11,7 +11,13 @@ requests >= 2.4.2
feedparser
```
The remaining imports should all be part of the standard Python install.
## Important Notes
As currently written, the script uses a hash of the post title to prevent sending duplicates. A recent change also checks the publish time, because some feeds are not in reverse chronological order (latest post at the top of the feed, i.e. entry index 0), so we can't just take the first entry and actually need to compare publish times. This still needs testing and things might be a bit broken because of it. If you see any issues, please let me know.
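In rough terms the idea looks like this (just a sketch of the approach, not the exact code in discorss.py; `entries` and `last_hash` are placeholder names):
```
import hashlib
import time

def pick_newest(entries):
    # Feeds aren't guaranteed to be newest-first, so compare publish times
    # instead of trusting entry index 0.
    return max(entries, key=lambda entry: time.mktime(entry["published_parsed"]))

def already_posted(entry, last_hash):
    # Hash the title of the newest entry; if it matches the stored hash,
    # this post was already sent to the webhook.
    new_hash = hashlib.sha3_512(bytes(entry["title"], "utf-8")).hexdigest()
    return new_hash == last_hash
```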
Logging was recently enabled. Make sure the user running the script (especially when using systemd timers) has write access to the /var/log/discorss directory. The script will try to create the directory for you, but if your user can't create directories in /var/log this will fail and, as it stands, will probably crash the script. I will try to remember to catch that exception and exit gracefully with an error message to stdout. If you want the logs to go somewhere else, edit the log_dir variable near the top of discorss.py and choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't define a per-user equivalent of /var/log, so I wasn't sure what the best default was. In the future we may switch to logging through systemd and journald directly, though it is nice to have a separate file.
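Something along these lines is what I have in mind for the graceful exit (a sketch only, this is not in the script yet):
```
from pathlib import Path
import sys

log_dir = "/var/log/discorss"  # default log directory, same as in discorss.py

try:
    Path(log_dir).mkdir(parents=True, exist_ok=True)
except PermissionError:
    print("Cannot create {}: permission denied. Fix the permissions or change log_dir.".format(log_dir))
    sys.exit(1)
```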
## How to setup
@@ -40,7 +46,7 @@ To configure the script, create /etc/discorss.conf with the following structure:
Create a webhook for each feed (unless you want them all to show up as the same webhook for whatever reason) and make sure to add it to the config. I have a webhook for each site, each with the site's icon and name set, which makes the messages look really nice.
The offset should only be needed if feeds aren't showing up. feedparser, in its infinite wisdom, ignores the timezone when converting publish dates from feeds, so most feeds end up with an epoch in UTC. The offset is the number of seconds between your time zone and UTC. This will be fixed in a future update; I just need to sit down and wrangle with feedparser and datetime some more. All fields are mandatory: if you want no offset, for example, set it to 0. The name and siteurl are used to build the "author" field in the Discord embed.
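For reference, a config ends up looking roughly like this (a made-up example; the values are placeholders, and lasthash/lastupdate are added by the script itself as it runs):
```
{
    "feeds": [
        {
            "name": "Example Blog",
            "siteurl": "https://example.com",
            "url": "https://example.com/feed.xml",
            "webhook": "https://discord.com/api/webhooks/...",
            "offset": 0
        }
    ]
}
```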
## Automation

discorss.py

@@ -13,6 +13,7 @@
import requests
import feedparser
import hashlib
import logging
from pathlib import Path
import json
import time
@@ -42,13 +43,13 @@ app_config = {}
# TODO: make the character limit smarter, as to split at a natural point
def get_description(feed):
    try:
        temporary_string = str(feed.entries[0]["summary_detail"]["value"])
        temporary_string = str(feed["summary_detail"]["value"])
        temporary_string = html_filter.sub("", temporary_string)
        desc = (
            temporary_string[:250] if len(temporary_string) > 250 else temporary_string
        )
    except KeyError:
        temporary_string = str(feed.entries[0]["description"])
        temporary_string = str(feed["description"])
        temporary_string = html_filter.sub("", temporary_string)
        desc = (
            temporary_string[:250] if len(temporary_string) > 250 else temporary_string
@@ -58,10 +59,11 @@ def get_description(feed):
def setupPaths():
    global app_config
    global logger
    # Check for log and config files/paths, create empty directories if needed
    # TODO: make this cleaner
    if not Path(log_file_path).exists():
        print("No log file path exists. Yark! We'll try and make {}...", log_dir)
    if not Path(log_dir).exists():
        print("No log file path exists. Yark! We'll try and make {}...".format(log_dir))
        try:
            Path(log_dir).mkdir(parents=True, exist_ok=True)
        except FileExistsError:
@@ -85,6 +87,15 @@ def setupPaths():
    # Loading the config file
    with open(config_file_path, "r") as config_file:
        app_config = json.load(config_file)
    # Set up logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        filename=str(log_dir + log_file_path),
        encoding="utf-8",
        level=logging.INFO,
        datefmt="%m/%d/%Y %H:%M:%S",
        format="%(asctime)s: %(levelname)s: %(message)s",
    )
    return
@@ -99,24 +110,28 @@ def main():
        last_check = now - 21600  # first run, no lastupdate, check up to 6 hours ago
    for i, hook in enumerate(app_config["feeds"]):
        # Get the feed
        print("Parsing feed {}...".format(hook["name"]))
        logger.info("Parsing feed %s...", hook["name"])
        feeds = feedparser.parse(hook["url"])
        latest_post = []
        prev_best = 0
        for feed in feeds:
        for feed in feeds["entries"]:
            try:
                bad_time = False
                published_time = time.mktime(feed["published_parsed"])
                published_time = published_time + hook["offset"]
            except KeyError:
                published_time = feed["published"]
                print(published_time)
                sys.exit(254)
                published_time = time.mktime(feed["updated_parsed"])
                bad_time = True
            if published_time > prev_best:
                latest_post = feed
                prev_best = published_time
            else:
                continue
        if bad_time is True:
            logger.warning(
                "Feed %s doesn't supply a published time, using updated time instead",
                hook["name"],
            )
        # Hash the title of the latest post and use that to determine if it's been posted
        new_hash = hashlib.sha3_512(bytes(latest_post["title"], "utf-8")).hexdigest()
        try:
@@ -126,7 +141,16 @@
                continue
        except KeyError:
            app_config["feeds"][i]["lasthash"] = new_hash
            logger.info(
                "Feed %s has no existing hash, likely a new feed!", hook["name"]
            )
        # Generate the webhook
        logger.info(
            "Publishing webhook for %s. Last check was %d, now is %d",
            hook["name"],
            last_check,
            now,
        )
        webhook = {
            "embeds": [
                {
@@ -157,19 +181,16 @@
        }
        webhook_string = json.dumps(webhook)
        if published_time > last_check:
            r = requests.post(
                hook["webhook"], data=webhook_string, headers=custom_header
            r = requests.post(hook["webhook"], data=webhook_string, headers=custom_header)
            if r.status_code not in success_codes:
                logger.error(
                    "Error %d while trying to post %s", r.status_code, hook["webhook"]
                )
            if r.status_code not in success_codes:
                print(
                    "Error {} while trying to post {}".format(
                        r.status_code, hook["webhook"]
                    )
                )
            app_config["lastupdate"] = now
            with open(config_file_path, "w") as config_file:
                json.dump(app_config, config_file, indent=4)
    # End of feed loop
    app_config["lastupdate"] = now
    with open(config_file_path, "w") as config_file:
        json.dump(app_config, config_file, indent=4)
    return