Compare commits
17 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
64111aca16 |
|||
|
cd07e9806f |
|||
|
84c0b40edf |
|||
|
91c39042c8 |
|||
|
acb03cff27 |
|||
|
0e8dba9d6d |
|||
|
e5a5f4f7d2 |
|||
|
a34cc1ac8e |
|||
|
4128e7808c |
|||
|
9c81fb0c81 |
|||
|
85d13b9309 |
|||
|
cdd7a2569f |
|||
|
63300e6012 |
|||
|
98d1a3ba45 |
|||
|
5a3e3333b3 |
|||
|
d412c1a378 |
|||
|
02aa1aa11b |
3 changed files with 441 additions and 125 deletions
177
.gitignore
vendored
177
.gitignore
vendored
|
|
@ -4,4 +4,179 @@ log/
|
||||||
*.bak
|
*.bak
|
||||||
bin/
|
bin/
|
||||||
lib/
|
lib/
|
||||||
*.cfg
|
*.cfg
# Created by https://www.toptal.com/developers/gitignore/api/python
|
||||||
|
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
||||||
|
|
||||||
|
### Python ###
|
||||||
|
# Byte-compiled / optimized / DLL files
|
||||||
|
__pycache__/
|
||||||
|
*.py[cod]
|
||||||
|
*$py.class
|
||||||
|
|
||||||
|
# C extensions
|
||||||
|
*.so
|
||||||
|
|
||||||
|
# Distribution / packaging
|
||||||
|
.Python
|
||||||
|
build/
|
||||||
|
develop-eggs/
|
||||||
|
dist/
|
||||||
|
downloads/
|
||||||
|
eggs/
|
||||||
|
.eggs/
|
||||||
|
lib/
|
||||||
|
lib64/
|
||||||
|
parts/
|
||||||
|
sdist/
|
||||||
|
var/
|
||||||
|
wheels/
|
||||||
|
share/python-wheels/
|
||||||
|
*.egg-info/
|
||||||
|
.installed.cfg
|
||||||
|
*.egg
|
||||||
|
MANIFEST
|
||||||
|
|
||||||
|
# PyInstaller
|
||||||
|
# Usually these files are written by a python script from a template
|
||||||
|
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||||
|
*.manifest
|
||||||
|
*.spec
|
||||||
|
|
||||||
|
# Installer logs
|
||||||
|
pip-log.txt
|
||||||
|
pip-delete-this-directory.txt
|
||||||
|
|
||||||
|
# Unit test / coverage reports
|
||||||
|
htmlcov/
|
||||||
|
.tox/
|
||||||
|
.nox/
|
||||||
|
.coverage
|
||||||
|
.coverage.*
|
||||||
|
.cache
|
||||||
|
nosetests.xml
|
||||||
|
coverage.xml
|
||||||
|
*.cover
|
||||||
|
*.py,cover
|
||||||
|
.hypothesis/
|
||||||
|
.pytest_cache/
|
||||||
|
cover/
|
||||||
|
|
||||||
|
# Translations
|
||||||
|
*.mo
|
||||||
|
*.pot
|
||||||
|
|
||||||
|
# Django stuff:
|
||||||
|
*.log
|
||||||
|
local_settings.py
|
||||||
|
db.sqlite3
|
||||||
|
db.sqlite3-journal
|
||||||
|
|
||||||
|
# Flask stuff:
|
||||||
|
instance/
|
||||||
|
.webassets-cache
|
||||||
|
|
||||||
|
# Scrapy stuff:
|
||||||
|
.scrapy
|
||||||
|
|
||||||
|
# Sphinx documentation
|
||||||
|
docs/_build/
|
||||||
|
|
||||||
|
# PyBuilder
|
||||||
|
.pybuilder/
|
||||||
|
target/
|
||||||
|
|
||||||
|
# Jupyter Notebook
|
||||||
|
.ipynb_checkpoints
|
||||||
|
|
||||||
|
# IPython
|
||||||
|
profile_default/
|
||||||
|
ipython_config.py
|
||||||
|
|
||||||
|
# pyenv
|
||||||
|
# For a library or package, you might want to ignore these files since the code is
|
||||||
|
# intended to run in multiple environments; otherwise, check them in:
|
||||||
|
# .python-version
|
||||||
|
|
||||||
|
# pipenv
|
||||||
|
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||||
|
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||||
|
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||||
|
# install all needed dependencies.
|
||||||
|
#Pipfile.lock
|
||||||
|
|
||||||
|
# poetry
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||||
|
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||||
|
# commonly ignored for libraries.
|
||||||
|
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||||
|
#poetry.lock
|
||||||
|
|
||||||
|
# pdm
|
||||||
|
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||||
|
#pdm.lock
|
||||||
|
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||||
|
# in version control.
|
||||||
|
# https://pdm.fming.dev/#use-with-ide
|
||||||
|
.pdm.toml
|
||||||
|
|
||||||
|
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||||
|
__pypackages__/
|
||||||
|
|
||||||
|
# Celery stuff
|
||||||
|
celerybeat-schedule
|
||||||
|
celerybeat.pid
|
||||||
|
|
||||||
|
# SageMath parsed files
|
||||||
|
*.sage.py
|
||||||
|
|
||||||
|
# Environments
|
||||||
|
.env
|
||||||
|
.venv
|
||||||
|
env/
|
||||||
|
venv/
|
||||||
|
ENV/
|
||||||
|
env.bak/
|
||||||
|
venv.bak/
|
||||||
|
|
||||||
|
# Spyder project settings
|
||||||
|
.spyderproject
|
||||||
|
.spyproject
|
||||||
|
|
||||||
|
# Rope project settings
|
||||||
|
.ropeproject
|
||||||
|
|
||||||
|
# mkdocs documentation
|
||||||
|
/site
|
||||||
|
|
||||||
|
# mypy
|
||||||
|
.mypy_cache/
|
||||||
|
.dmypy.json
|
||||||
|
dmypy.json
|
||||||
|
|
||||||
|
# Pyre type checker
|
||||||
|
.pyre/
|
||||||
|
|
||||||
|
# pytype static type analyzer
|
||||||
|
.pytype/
|
||||||
|
|
||||||
|
# Cython debug symbols
|
||||||
|
cython_debug/
|
||||||
|
|
||||||
|
# PyCharm
|
||||||
|
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||||
|
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||||
|
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||||
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
|
#.idea/
|
||||||
|
|
||||||
|
### Python Patch ###
|
||||||
|
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||||
|
poetry.toml
|
||||||
|
|
||||||
|
# ruff
|
||||||
|
.ruff_cache/
|
||||||
|
|
||||||
|
# LSP config files
|
||||||
|
pyrightconfig.json
|
||||||
|
|
||||||
|
# End of https://www.toptal.com/developers/gitignore/api/python
|
||||||
|
|
|
||||||
78
README.md
78
README.md
|
|
@ -11,46 +11,27 @@ requests >= 2.4.2
|
||||||
feedparser
|
feedparser
|
||||||
```
|
```
|
||||||
|
|
||||||
The remaining imports should all be part of the standard Python install.
|
The remaining imports should all be part of the standard Python install: hashlib, logging, asyncio, pathlib, json, time, os, sys, argparse, re, types. To install the required ones, use your distro's package manager -- don't use pip unless you want to run the entire script in a virtualenv (which would make the systemd unit file a bit more complex: `ExecStart` would have to call the virtualenv's own Python interpreter instead of the system one).
|
||||||
|
|
||||||
|
<p align="center">
|
||||||
|
<img src="https://frzn.dev/~amr/images/Screenshot_224228_1.png">
|
||||||
|
</p>
|
||||||
|
|
||||||
## Important Notes
|
## Important Notes
|
||||||
|
|
||||||
The logger will try and put the logs in `/var/log/discorss`. Make sure to create this directory and give the user running the script write permissions there. If you want the logs to go somewhere else, just edit the log_dir variable near the top of discorss.py. Choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't have an equivalent to the /var/log directory in the user directory, so I wasn't sure what the best default was. In the future, we may switch to logging using systemd and journald directly, though it is nice to have a separate file.
|
By default, DiscoRSS will try to put the logs in `/var/log/discorss`. Make sure to create this directory and give the user running the script write permissions there. If you want the logs to go somewhere else, pass the path with the `-l` / `--log-file` argument (described below). Choose a directory that makes sense. Unfortunately, as far as I know, the XDG standards don't have an equivalent to the /var/log directory in the user directory, so I wasn't sure what the best default was. In the future, we may switch to logging using systemd and journald directly, though it is nice to have a separate file.
|
||||||
|
|
||||||
|
### Script Arguments
|
||||||
|
|
||||||
|
The script has a few different arguments that make it easy to customize certain things:
|
||||||
|
|
||||||
|
* `-d / --dry-run`: Just like it says on the tin -- run the script, pull feeds, but don't post anything to Discord
|
||||||
|
* `-c / --config-file`: Give a path to an alternate location for the config file. The default is ~/.config/discorss/discorss.conf and that should be fine for the vast majority of users.
|
||||||
|
* `-l / --log-file`: Give a path to where you want the log file stored. The default is /var/log/discorss/app.log but you will have to create the /var/log/discorss directory and make it writeable by whatever user will be running the script.
|
||||||
|
|
||||||
## How to setup
|
## How to setup
|
||||||
|
|
||||||
Note: see the Automation section below for info about using the `install.sh` script to help get all the files in the right places.
|
### Automation
|
||||||
|
|
||||||
### Config file format
|
|
||||||
|
|
||||||
To configure the script, create `~/.config/discorss/discorss.conf` using JSON formatting like this:
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"feeds": [
|
|
||||||
{
|
|
||||||
"name": "Phoronix",
|
|
||||||
"siteurl": "https://www.phoronix.com/",
|
|
||||||
"url": "http://www.phoronix.com/rss.php",
|
|
||||||
"webhook": "webhook url",
|
|
||||||
"offset": -18000
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Pagetable",
|
|
||||||
"siteurl": "https://pagetable.com",
|
|
||||||
"url": "https://www.pagetable.com/?feed=rss2",
|
|
||||||
"webhook": "webhook url",
|
|
||||||
"offset": -18000
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Create a webhook for each feed (unless you want them all to show as the same webhook for whatever reason) and make sure to add it in to the config. I have it set up with a webhook for each site, each with the site's icon and name set for the webhook which makes the messages look really nice.
|
|
||||||
|
|
||||||
The offset should only be required if feeds aren't showing up. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds. So most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update, I just need to sit down and wrangle with feedparser and datetime some more. All fields are mandatory, if you want to have no offset for example, set it to 0. The name and siteurl are used to create the "author" field in the Discord embed.
|
|
||||||
|
|
||||||
## Automation
|
|
||||||
|
|
||||||
**New**: There is now `install.sh` in the repo which will automatically help you set up both the config file and the systemd unit files for the service and timer, using essentially the exact text below. It will copy them to the user systemd unit folder, `~/.config/systemd/user` and optionally enable the timer. It's a good idea to edit the configuration file at `~/.config/discorss/discorss.conf` and paste in your webhook URLs and add any other feeds you want before starting the timer, unless you can do it really quickly before the next 5 minute spot on the clock :)
|
**New**: There is now `install.sh` in the repo which will automatically help you set up both the config file and the systemd unit files for the service and timer, using essentially the exact text below. It will copy them to the user systemd unit folder, `~/.config/systemd/user` and optionally enable the timer. It's a good idea to edit the configuration file at `~/.config/discorss/discorss.conf` and paste in your webhook URLs and add any other feeds you want before starting the timer, unless you can do it really quickly before the next 5 minute spot on the clock :)
|
||||||
Of course, if it fires with an invalid config, the script will just crash, and you'll probably just have to manually start the timer once the config is fixed, so not a big deal.
|
Of course, if it fires with an invalid config, the script will just crash, and you'll probably just have to manually start the timer once the config is fixed, so not a big deal.
|
||||||
|
|
@ -94,6 +75,35 @@ WantedBy=timers.target
|
||||||
|
|
||||||
To change how often this fires, edit the OnCalendar parameter. The config above has it firing every 15 minutes at half past the minute. Look at the systemd timer man pages for help if you want to tweak it.
|
To change how often this fires, edit the OnCalendar parameter. The config above has it firing every 15 minutes at half past the minute. Look at the systemd timer man pages for help if you want to tweak it.
|
||||||
|
|
||||||
|
### Config file format
|
||||||
|
|
||||||
|
To configure the script, create `~/.config/discorss/discorss.conf` (or have install.sh create it for you) using JSON formatting like this:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"feeds": [
|
||||||
|
{
|
||||||
|
"name": "Phoronix",
|
||||||
|
"siteurl": "https://www.phoronix.com/",
|
||||||
|
"url": "http://www.phoronix.com/rss.php",
|
||||||
|
"webhook": "webhook url",
|
||||||
|
"offset": -18000
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Pagetable",
|
||||||
|
"siteurl": "https://pagetable.com",
|
||||||
|
"url": "https://www.pagetable.com/?feed=rss2",
|
||||||
|
"webhook": "webhook url",
|
||||||
|
"offset": -18000
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Create a webhook for each feed (unless you want them all to show as the same webhook for whatever reason) and make sure to add it in to the config. I have it set up with a webhook for each site, each with the site's icon and name set for the webhook which makes the messages look really nice.
|
||||||
|
|
||||||
|
The offset should only be required if feeds from the previous 6 hours aren't showing up when you first start the script. This is because feedparser, in its infinite wisdom, just ignores the timezone when converting publish dates from feeds, so most feeds end up with an epoch in UTC. The offset should be the number of seconds between your time zone and UTC. This will eventually be fixed in a future update; I just need to sit down and wrestle with feedparser and datetime some more. All fields are mandatory: if you want no offset, for example, set it to 0. The name and siteurl are used to create the "author" field in the Discord embed.
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
Want to fix something or make a suggestion? Feel free! If you want to send a pull request, you *must* run the Python `black` formatter on the source code before committing. I have this set up in my editor to automatically run every time I save the file, but you could have it run as part of a git hook or something. For non-format stuff, please just follow the code style as best you can. For Python code, I separate multi-word variable names with underscores. So it should be `feed_time`, not `feedTime` or `FeedTime` or `feed-time`. Don't ask me why, but I use camelCase for other languages... but in Python I've switched to underscores.
|
Want to fix something or make a suggestion? Feel free! If you want to send a pull request, you *must* run the Python `black` formatter on the source code before committing. I have this set up in my editor to automatically run every time I save the file, but you could have it run as part of a git hook or something. For non-format stuff, please just follow the code style as best you can. For Python code, I separate multi-word variable names with underscores. So it should be `feed_time`, not `feedTime` or `FeedTime` or `feed-time`. Don't ask me why, but I use camelCase for other languages... but in Python I've switched to underscores.
|
||||||
|
|
|
||||||
281
discorss.py
281
discorss.py
|
|
@ -22,32 +22,94 @@ import os
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
import re
|
import re
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
|
||||||
class Discorss:
|
class Discorss:
|
||||||
FEED_TIMEOUT_SECONDS = 15
|
FEED_TIMEOUT_SECONDS = 15
|
||||||
|
HASH_HISTORY_LIMIT = 10
|
||||||
|
APP_VERSION = "0.3rc2"
|
||||||
|
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif", ".webp")
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, args=None):
|
||||||
|
if args is None:
|
||||||
|
args = SimpleNamespace(
|
||||||
|
dry_run=False,
|
||||||
|
config_file=None,
|
||||||
|
log_file=None,
|
||||||
|
)
|
||||||
|
|
||||||
|
self.DRY_RUN = args.dry_run
|
||||||
self.config_dir = os.environ.get("XDG_CONFIG_HOME")
|
self.config_dir = os.environ.get("XDG_CONFIG_HOME")
|
||||||
home_dir = Path.home()
|
home_dir = Path.home()
|
||||||
if self.config_dir is None:
|
if self.config_dir is None:
|
||||||
self.config_file_path = str(home_dir) + "/.config/discorss/discorss.conf"
|
default_config_file_path = str(home_dir) + "/.config/discorss/discorss.conf"
|
||||||
self.config_dir = str(home_dir) + "/.config/discorss"
|
|
||||||
else:
|
else:
|
||||||
self.config_file_path = self.config_dir + r"/discorss/discorss.conf"
|
default_config_file_path = self.config_dir + r"/discorss/discorss.conf"
|
||||||
self.log_dir = r"/var/log/discorss"
|
self.config_file_path = args.config_file or default_config_file_path
|
||||||
self.log_file_path = r"/app.log"
|
self.config_dir = str(Path(self.config_file_path).parent)
|
||||||
|
|
||||||
|
default_log_file_path = "/var/log/discorss/app.log"
|
||||||
|
self.log_file_path = args.log_file or default_log_file_path
|
||||||
|
self.log_dir = str(Path(self.log_file_path).parent)
|
||||||
# Yes, I know you "can't parse HTML with regex", but
|
# Yes, I know you "can't parse HTML with regex", but
|
||||||
# just watch me.
|
# just watch me.
|
||||||
self.html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>")
|
self.html_filter = re.compile(r"\<\/?([A-Za-z0-9 \:\.\-\/\"\=])*\>")
|
||||||
|
self.img_src_filter = re.compile(r'<img[^>]+src=["\']([^"\']+)["\']', re.I)
|
||||||
self.success_codes = [200, 201, 202, 203, 204, 205, 206]
|
self.success_codes = [200, 201, 202, 203, 204, 205, 206]
|
||||||
self.app_config = {}
|
self.app_config = {}
|
||||||
|
print(f"Logging to {self.log_file_path}")
|
||||||
|
|
||||||
|
self.now = time.mktime(time.localtime())
|
||||||
|
# Set up logging
|
||||||
|
self.logger = logging.getLogger(__name__)
|
||||||
|
logging.basicConfig(
|
||||||
|
filename=self.log_file_path,
|
||||||
|
encoding="utf-8",
|
||||||
|
level=logging.DEBUG,
|
||||||
|
datefmt="%m/%d/%Y %H:%M:%S",
|
||||||
|
format="%(asctime)s [%(threadName)s] -> %(levelname)s: %(message)s",
|
||||||
|
)
|
||||||
|
# Check for log and config files/paths, create empty directories if needed
|
||||||
|
# TODO: change output to log file, as warning/error
|
||||||
|
if not Path(self.log_dir).exists():
|
||||||
|
self.logger.warning(
|
||||||
|
"No log file path exists. Yark! We'll try and make %s...", self.log_dir
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
Path(self.log_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
except FileExistsError:
|
||||||
|
self.logger.critical(
|
||||||
|
"The path {} already exists and is not a directory!".format(
|
||||||
|
self.log_dir
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if not Path(self.config_file_path).exists():
|
||||||
|
self.logger.warning(
|
||||||
|
"No config file at {}! Snarf. We'll try and make {}...".format(
|
||||||
|
self.config_file_path, self.config_dir
|
||||||
|
)
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
Path(self.config_dir).mkdir(parents=True, exist_ok=True)
|
||||||
|
except FileExistsError:
|
||||||
|
self.warning.critical(
|
||||||
|
"The config dir {} already exists and is not a directory! Please fix manually. Quitting!".format(
|
||||||
|
self.config_dir
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sys.exit(255)
|
||||||
|
return
|
||||||
|
|
||||||
async def _fetch_feed(self, hook):
|
async def _fetch_feed(self, hook):
|
||||||
response = await asyncio.to_thread(
|
response = await asyncio.to_thread(
|
||||||
requests.get,
|
requests.get,
|
||||||
hook["url"],
|
hook["url"],
|
||||||
headers={"user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2)"},
|
headers={
|
||||||
|
"user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, {})".format(
|
||||||
|
self.APP_VERSION
|
||||||
|
)
|
||||||
|
},
|
||||||
timeout=self.FEED_TIMEOUT_SECONDS,
|
timeout=self.FEED_TIMEOUT_SECONDS,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
|
|
@ -62,24 +124,36 @@ class Discorss:
|
||||||
timeout=self.FEED_TIMEOUT_SECONDS,
|
timeout=self.FEED_TIMEOUT_SECONDS,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _get_hash_history(self, hook):
|
||||||
|
# now we store a list of hashes 10 long
|
||||||
|
# this function checks if it's the old format and updates it if needed
|
||||||
|
existing_hashes = hook.get("lasthash", [])
|
||||||
|
if isinstance(existing_hashes, str):
|
||||||
|
return [existing_hashes]
|
||||||
|
if isinstance(existing_hashes, list):
|
||||||
|
return [
|
||||||
|
saved_hash
|
||||||
|
for saved_hash in existing_hashes
|
||||||
|
if isinstance(saved_hash, str)
|
||||||
|
]
|
||||||
|
return []
|
||||||
|
|
||||||
async def _process_feed(self, hook, last_check):
|
async def _process_feed(self, hook, last_check):
|
||||||
self.logger.debug("Parsing feed %s...", hook["name"])
|
self.logger.debug("Parsing feed %s...", hook["name"])
|
||||||
feeds = await self._fetch_feed(hook)
|
feed = await self._fetch_feed(hook)
|
||||||
latest_post = None
|
latest_post = None
|
||||||
latest_post_time = None
|
|
||||||
prev_best = 0
|
prev_best = 0
|
||||||
bad_time = False
|
bad_time = False
|
||||||
self.logger.debug("About to sort through entries for feed %s ...", hook["name"])
|
self.logger.debug("About to sort through entries for feed %s ...", hook["name"])
|
||||||
for feed in feeds["entries"]:
|
for post in feed["entries"]:
|
||||||
try:
|
try:
|
||||||
published_time = time.mktime(feed["published_parsed"])
|
published_time = time.mktime(post["published_parsed"])
|
||||||
published_time = published_time + hook["offset"]
|
published_time = published_time + hook["offset"]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
published_time = time.mktime(feed["updated_parsed"])
|
published_time = time.mktime(post["updated_parsed"])
|
||||||
bad_time = True
|
bad_time = True
|
||||||
if published_time > prev_best:
|
if published_time > prev_best:
|
||||||
latest_post = feed
|
latest_post = post
|
||||||
latest_post_time = published_time
|
|
||||||
prev_best = published_time
|
prev_best = published_time
|
||||||
|
|
||||||
if latest_post is None:
|
if latest_post is None:
|
||||||
|
|
@ -91,18 +165,18 @@ class Discorss:
|
||||||
"Feed %s doesn't supply a published time, using updated time instead",
|
"Feed %s doesn't supply a published time, using updated time instead",
|
||||||
hook["name"],
|
hook["name"],
|
||||||
)
|
)
|
||||||
# Hash the title and time of the latest post and use that to determine if it's been posted
|
# Hash the url of the latest post and use that to determine if it's been posted
|
||||||
# Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3
|
# Yes, SHA3-512 is totally unnecessary for this purpose, but I love SHA3
|
||||||
self.logger.debug("About to hash %s ...", latest_post["title"])
|
self.logger.debug("About to hash %s ...", latest_post["link"])
|
||||||
try:
|
try:
|
||||||
new_hash = hashlib.sha3_512(
|
new_hash = hashlib.sha3_512(
|
||||||
bytes(latest_post["title"] + str(latest_post_time), "utf-8")
|
bytes(latest_post["link"], "utf-8") # Removed time from hash
|
||||||
).hexdigest()
|
).hexdigest()
|
||||||
except TypeError:
|
except TypeError:
|
||||||
self.logger.error("Title of %s isn't hashing correctly", hook["name"])
|
self.logger.error("URL %s isn't hashing correctly", hook["link"])
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if hook.get("lasthash") == new_hash:
|
if new_hash in self._get_hash_history(hook):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Generate the webhook
|
# Generate the webhook
|
||||||
|
|
@ -112,9 +186,7 @@ class Discorss:
|
||||||
last_check,
|
last_check,
|
||||||
self.now,
|
self.now,
|
||||||
)
|
)
|
||||||
webhook = {
|
embed = {
|
||||||
"embeds": [
|
|
||||||
{
|
|
||||||
"title": str(latest_post["title"]),
|
"title": str(latest_post["title"]),
|
||||||
"url": str(latest_post["link"]),
|
"url": str(latest_post["link"]),
|
||||||
"color": 2123412,
|
"color": 2123412,
|
||||||
|
|
@ -134,17 +206,35 @@ class Discorss:
|
||||||
],
|
],
|
||||||
# "timestamp": str(self.now),
|
# "timestamp": str(self.now),
|
||||||
}
|
}
|
||||||
],
|
self.logger.debug(
|
||||||
|
"Checking for images in post %s from %s...",
|
||||||
|
latest_post["title"],
|
||||||
|
hook["name"],
|
||||||
|
)
|
||||||
|
image_url = self.get_image_url(latest_post)
|
||||||
|
if image_url is not None:
|
||||||
|
embed["thumbnail"] = {"url": image_url}
|
||||||
|
|
||||||
|
webhook = {
|
||||||
|
"embeds": [embed],
|
||||||
"attachments": [],
|
"attachments": [],
|
||||||
}
|
}
|
||||||
custom_header = {
|
custom_header = {
|
||||||
"user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, 0.2)",
|
"user-agent": "DiscoRSS (https://git.frzn.dev/amr/discorss, {})".format(
|
||||||
|
self.APP_VERSION
|
||||||
|
),
|
||||||
"content-type": "application/json",
|
"content-type": "application/json",
|
||||||
}
|
}
|
||||||
webhook_string = json.dumps(webhook)
|
webhook_string = json.dumps(webhook)
|
||||||
|
|
||||||
self.logger.debug("About to run POST for %s", hook["name"])
|
self.logger.debug("About to run POST for %s", hook["name"])
|
||||||
|
if not self.DRY_RUN:
|
||||||
response = await self._post_webhook(hook, webhook_string, custom_header)
|
response = await self._post_webhook(hook, webhook_string, custom_header)
|
||||||
|
else:
|
||||||
|
self.logger.info(
|
||||||
|
"Dry run, not actually posting to webhook, faking return code 200"
|
||||||
|
)
|
||||||
|
response = SimpleNamespace(status_code=200)
|
||||||
if response.status_code not in self.success_codes:
|
if response.status_code not in self.success_codes:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"Error %d while trying to post %s", response.status_code, hook["name"]
|
"Error %d while trying to post %s", response.status_code, hook["name"]
|
||||||
|
|
@ -168,14 +258,14 @@ class Discorss:
|
||||||
for i, result in enumerate(results):
|
for i, result in enumerate(results):
|
||||||
hook = self.app_config["feeds"][i]
|
hook = self.app_config["feeds"][i]
|
||||||
if isinstance(result, asyncio.TimeoutError):
|
if isinstance(result, asyncio.TimeoutError):
|
||||||
self.logger.error(
|
self.logger.critical(
|
||||||
"Timed out processing feed %s after %d seconds",
|
"Timed out processing feed %s after %d seconds",
|
||||||
hook["name"],
|
hook["name"],
|
||||||
self.FEED_TIMEOUT_SECONDS,
|
self.FEED_TIMEOUT_SECONDS,
|
||||||
)
|
)
|
||||||
continue
|
continue
|
||||||
if isinstance(result, requests.RequestException):
|
if isinstance(result, requests.RequestException):
|
||||||
self.logger.error(
|
self.logger.critical(
|
||||||
"Network error while processing feed %s: %s",
|
"Network error while processing feed %s: %s",
|
||||||
hook["name"],
|
hook["name"],
|
||||||
result,
|
result,
|
||||||
|
|
@ -191,17 +281,21 @@ class Discorss:
|
||||||
if result is None:
|
if result is None:
|
||||||
continue
|
continue
|
||||||
if "lasthash" not in hook:
|
if "lasthash" not in hook:
|
||||||
self.logger.info(
|
self.logger.debug(
|
||||||
"Feed %s has no existing hash, likely a new feed!", hook["name"]
|
"Feed %s has no existing hash, likely a new feed!", hook["name"]
|
||||||
)
|
)
|
||||||
self.app_config["feeds"][i]["lasthash"] = result
|
hash_history = self._get_hash_history(hook)
|
||||||
|
hash_history.append(result)
|
||||||
|
if len(hash_history) > self.HASH_HISTORY_LIMIT:
|
||||||
|
hash_history = hash_history[-self.HASH_HISTORY_LIMIT :]
|
||||||
|
self.app_config["feeds"][i]["lasthash"] = hash_history
|
||||||
|
|
||||||
# This function gets and formats the brief excerpt that goes in the embed
|
# This function gets and formats the brief excerpt that goes in the embed
|
||||||
# Different feeds put summaries in different fields, so we pick the best
|
# Different feeds put summaries in different fields, so we pick the best
|
||||||
# one and limit it to 250 characters.
|
# one and limit it to 250 characters.
|
||||||
def get_description(self, feed, length=250, min_length=150, addons=None):
|
def get_description(self, post, length=250, min_length=150, addons=None):
|
||||||
try:
|
try:
|
||||||
temporary_string = str(feed["summary_detail"]["value"])
|
temporary_string = str(post["summary_detail"]["value"])
|
||||||
temporary_string = self.html_filter.sub("", temporary_string)
|
temporary_string = self.html_filter.sub("", temporary_string)
|
||||||
while length > min_length:
|
while length > min_length:
|
||||||
if temporary_string[length - 1 : length] == " ":
|
if temporary_string[length - 1 : length] == " ":
|
||||||
|
|
@ -209,7 +303,7 @@ class Discorss:
|
||||||
else:
|
else:
|
||||||
length -= 1
|
length -= 1
|
||||||
except KeyError:
|
except KeyError:
|
||||||
temporary_string = str(feed["description"])
|
temporary_string = str(post["description"])
|
||||||
temporary_string = self.html_filter.sub("", temporary_string)
|
temporary_string = self.html_filter.sub("", temporary_string)
|
||||||
while length > min_length:
|
while length > min_length:
|
||||||
if temporary_string[length - 1 : length] == " ":
|
if temporary_string[length - 1 : length] == " ":
|
||||||
|
|
@ -222,59 +316,72 @@ class Discorss:
|
||||||
desc = desc + str(addons)
|
desc = desc + str(addons)
|
||||||
return desc
|
return desc
|
||||||
|
|
||||||
# Some of this could go in __init__
|
# attempting to extract image previews from feeds which primarily feature
|
||||||
|
# images, like NASA's Picture of the Day feed
|
||||||
|
def get_image_url(self, post):
|
||||||
|
image_candidates = []
|
||||||
|
# check the most common fields, this should catch the majority of image
|
||||||
|
# feeds' embedded urls
|
||||||
|
for media in post.get("media_content", []):
|
||||||
|
if self.is_image_url(media.get("url"), media.get("type")):
|
||||||
|
image_candidates.append(media["url"])
|
||||||
|
|
||||||
|
for enclosure in post.get("enclosures", []):
|
||||||
|
if self.is_image_url(enclosure.get("href"), enclosure.get("type")):
|
||||||
|
image_candidates.append(enclosure["href"])
|
||||||
|
|
||||||
|
for link in post.get("links", []):
|
||||||
|
if self.is_image_url(link.get("href"), link.get("type")):
|
||||||
|
image_candidates.append(link["href"])
|
||||||
|
|
||||||
|
for media in post.get("media_thumbnail", []):
|
||||||
|
if self.is_image_url(media.get("url"), media.get("type")):
|
||||||
|
image_candidates.append(media["url"])
|
||||||
|
|
||||||
|
for field in ["summary_detail", "content"]:
|
||||||
|
value = post.get(field)
|
||||||
|
if isinstance(value, list):
|
||||||
|
values = [item.get("value", "") for item in value]
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
values = [value.get("value", "")]
|
||||||
|
else:
|
||||||
|
values = []
|
||||||
|
for text in values:
|
||||||
|
match = self.img_src_filter.search(str(text))
|
||||||
|
if match and self.is_image_url(match.group(1)):
|
||||||
|
image_candidates.append(match.group(1))
|
||||||
|
self.logger.debug("Found the following image candidates in %s...", post["title"])
|
||||||
|
for i in image_candidates:
|
||||||
|
self.logger.debug("%s", i)
|
||||||
|
if len(image_candidates) > 0:
|
||||||
|
return image_candidates[0]
|
||||||
|
return None
|
||||||
|
|
||||||
|
# a silly little helper just to validate image links
|
||||||
|
# this isn't 100% foolproof but it should work for the
|
||||||
|
# vast majority of feeds out there, unless they use some
|
||||||
|
# really weird image type like image/bpg
|
||||||
|
def is_image_url(self, url, mime_type=None):
    """Return True when ``url`` appears to point at an image.

    A declared ``image/*`` MIME type is trusted outright. Otherwise the
    decision falls back to the file extension of the URL, ignoring any
    query string or fragment. This is a heuristic: an extension-less URL
    with no MIME type is rejected even if the server would serve an image.

    Args:
        url: Candidate URL; None or empty is never an image.
        mime_type: Optional MIME type advertised alongside the URL.

    Returns:
        bool: whether the URL should be treated as an image.
    """
    if not url:
        return False
    if mime_type and str(mime_type).strip().lower().startswith("image/"):
        return True
    # Drop the query string and fragment so "pic.jpg?w=640" and
    # "pic.jpg#top" still match by extension. This works unless the server
    # depends on a parameter to create the image, but in that case we'll
    # just hope it has a mime_type instead.
    path = str(url).lower().split("?", 1)[0].split("#", 1)[0]
    # NOTE(review): assumes IMAGE_EXTENSIONS holds lower-case suffixes,
    # since the URL is lower-cased before comparison -- confirm.
    return path.endswith(self.IMAGE_EXTENSIONS)
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
os.environ["TZ"] = "America/Toronto"
|
os.environ["TZ"] = "America/Toronto"
|
||||||
time.tzset()
|
time.tzset()
|
||||||
self.now = time.mktime(time.localtime())
|
|
||||||
# Check for log and config files/paths, create empty directories if needed
|
|
||||||
# TODO: change output to log file, as warning/error
|
|
||||||
if not Path(self.log_dir).exists():
|
|
||||||
print(
|
|
||||||
"No log file path exists. Yark! We'll try and make {}...".format(
|
|
||||||
self.log_dir
|
|
||||||
)
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
Path(self.log_dir).mkdir(parents=True, exist_ok=True)
|
|
||||||
except FileExistsError:
|
|
||||||
print(
|
|
||||||
"The path {} already exists and is not a directory!".format(
|
|
||||||
self.log_dir
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if not Path(self.config_file_path).exists():
|
|
||||||
print(
|
|
||||||
"No config file at {}! Snarf. We'll try and make {}...".format(
|
|
||||||
self.config_file_path, self.config_dir
|
|
||||||
)
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
Path(self.config_dir).mkdir(parents=True, exist_ok=True)
|
|
||||||
except FileExistsError:
|
|
||||||
print(
|
|
||||||
"The config dir {} already exists and is not a directory! Please fix manually. Quitting!".format(
|
|
||||||
self.config_dir
|
|
||||||
)
|
|
||||||
)
|
|
||||||
sys.exit(255)
|
|
||||||
return
|
|
||||||
# Loading the config file
|
# Loading the config file
|
||||||
with open(self.config_file_path, "r") as config_file:
|
with open(self.config_file_path, "r") as config_file:
|
||||||
self.app_config = json.load(config_file)
|
self.app_config = json.load(config_file)
|
||||||
# Set up logging
|
|
||||||
self.logger = logging.getLogger(__name__)
|
|
||||||
logging.basicConfig(
|
|
||||||
filename=str(self.log_dir + self.log_file_path),
|
|
||||||
encoding="utf-8",
|
|
||||||
level=logging.ERROR,
|
|
||||||
datefmt="%m/%d/%Y %H:%M:%S",
|
|
||||||
format="%(asctime)s: %(levelname)s: %(message)s",
|
|
||||||
)
|
|
||||||
return
|
return
|
||||||
|
|
||||||
def process(self):
|
def process(self):
|
||||||
self.setup() # Handle the config and log paths
|
self.setup() # Handle the config and log paths
|
||||||
|
self.logger.info("Starting DiscoRSS version {}...".format(self.APP_VERSION))
|
||||||
try:
|
try:
|
||||||
last_check = self.app_config["lastupdate"]
|
last_check = self.app_config["lastupdate"]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
|
|
@ -289,6 +396,7 @@ class Discorss:
|
||||||
with open(self.config_file_path, "w") as config_file:
|
with open(self.config_file_path, "w") as config_file:
|
||||||
json.dump(self.app_config, config_file, indent=4)
|
json.dump(self.app_config, config_file, indent=4)
|
||||||
|
|
||||||
|
self.logger.info("Stopping DiscoRSS version {}...".format(self.APP_VERSION))
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -296,7 +404,30 @@ class Discorss:
|
||||||
|
|
||||||
|
|
||||||
def main(argv=None):
    """Parse command-line options and run one DiscoRSS pass.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads ``sys.argv[1:]``, so existing ``main()`` callers
            behave exactly as before. Passing a list lets the entry point
            be driven programmatically.
    """
    parser = argparse.ArgumentParser(
        description="\x1b[1;34mDisco\x1b[33mRSS\x1b[0m: publish feed updates to Discord webhooks."
    )
    parser.add_argument(
        "-d",
        "--dry-run",
        action="store_true",
        help="Parse feeds and update state without posting to Discord.",
    )
    parser.add_argument(
        "-c",
        "--config-file",
        default=None,
        help="Alternate config file path. Defaults to the existing config location.",
    )
    parser.add_argument(
        "-l",
        "--log-file",
        default=None,
        help="Alternate log file path. Defaults to the existing log location.",
    )
    args = parser.parse_args(argv)

    app = Discorss(args)
    app.process()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue