From d635f1cbc8452ef3aff81486aa1153ec6f74bf88 Mon Sep 17 00:00:00 2001 From: Jake Walker Date: Tue, 21 Jan 2025 23:40:23 +0000 Subject: [PATCH] initial commit --- .editorconfig | 15 +++++ .gitignore | 122 +++++++++++++++++++++++++++++++++++++++++ .vscode/settings.json | 3 + .woodpecker/deploy.yml | 19 +++++++ Cargo.toml | 12 ++++ LICENSE | 12 ++++ README.md | 3 + src/lib.rs | 1 + src/main.rs | 99 +++++++++++++++++++++++++++++++++ src/parser.rs | 77 ++++++++++++++++++++++++++ 10 files changed, 363 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 .woodpecker/deploy.yml create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..1716961 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,15 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true + +[*.{yml,yaml}] +indent_size = 2 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..cc9e621 --- /dev/null +++ b/.gitignore @@ -0,0 +1,122 @@ +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig +# Created by https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,rust,macos,linux,rust-analyzer +# Edit at https://www.toptal.com/developers/gitignore?templates=windows,visualstudiocode,rust,macos,linux,rust-analyzer + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are 
created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Rust ### +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +### rust-analyzer ### +# Can be generated by other build systems other than cargo (ex: bazelbuild/rust_rules) +rust-project.json + + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,rust,macos,linux,rust-analyzer + +# Custom rules (everything added below 
won't be overriden by 'Generate .gitignore File' if you use 'Update' option) +dist diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b8c0095 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "licenser.license": "0BSD" +} diff --git a/.woodpecker/deploy.yml b/.woodpecker/deploy.yml new file mode 100644 index 0000000..c3ced82 --- /dev/null +++ b/.woodpecker/deploy.yml @@ -0,0 +1,19 @@ +when: + - event: push + branch: ${CI_REPO_DEFAULT_BRANCH} + - event: cron + cron: daily + - event: manual + +steps: + - name: generate + image: rust + commands: + - cargo run + - name: deploy + image: node:alpine + commands: + - npx wrangler pages deploy ./dist --project-name mininews + environment: + CLOUDFLARE_API_TOKEN: + from_secret: cf_token diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..7104258 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "mininews" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.95" +chrono = { version = "0.4.39", default-features = false, features = ["now"] } +kuchikiki = "0.8.2" +reqwest = { version = "0.12.12", features = ["blocking"] } +rss = { version = "2.0.11" } +uuid = { version = "1.12.1", features = ["v7"] } diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..ddb1353 --- /dev/null +++ b/LICENSE @@ -0,0 +1,12 @@ + Zero-Clause BSD / Free Public License 1.0.0 (0BSD) + + Permission to use, copy, modify, and/or distribute this software for any purpose + with or without fee is hereby granted. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS.
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + PERFORMANCE OF THIS SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea491dc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# Mininews + +A simple RSS generator for [Wikipedia's Current Events](https://en.wikipedia.org/wiki/Portal:Current_events). Inspired by [tom-james-watson's detoxed.news](https://github.com/tom-james-watson/detoxed.news) project. diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..67c567f --- /dev/null +++ b/src/lib.rs @@ -0,0 +1 @@ +pub mod parser;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..6294ced
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,107 @@
+//! Generates RSS feeds from Wikipedia's Current Events portal into `dist/`.
+
+use std::fs;
+
+use anyhow::{Context, Result};
+use chrono::{Datelike, Days, NaiveTime, Utc};
+use mininews::parser::{parse, EventBlock, PAGE_URL};
+use reqwest::blocking::get;
+use rss::{ChannelBuilder, Guid, Item, ItemBuilder};
+
+/// Returns the English ordinal suffix ("st", "nd", "rd" or "th") for a day of the month.
+fn ordinal_suffix(day: u32) -> &'static str {
+    match day {
+        1 | 21 | 31 => "st",
+        2 | 22 => "nd",
+        3 | 23 => "rd",
+        _ => "th",
+    }
+}
+
+/// Builds the RSS item for one day's event block.
+///
+/// The item is dated at midnight UTC at the start of the day after `block.date`.
+fn build_item(block: &EventBlock) -> Result<Item> {
+    let pub_date = block
+        .date
+        .checked_add_days(Days::new(1))
+        .context("failed to add to date")?
+        .and_time(NaiveTime::MIN)
+        .and_utc();
+
+    Ok(ItemBuilder::default()
+        .title(format!(
+            "Current Events - {}{} {}",
+            block.date.format("%A %-d"),
+            ordinal_suffix(block.date.day()),
+            block.date.format("%B")
+        ))
+        .link(Some(PAGE_URL.to_string()))
+        .pub_date(pub_date.to_rfc2822())
+        // The GUID must be identical on every run, otherwise feed readers treat each
+        // regenerated item as new. A random UUID v7 is not stable; the date is.
+        .guid(Guid {
+            permalink: false,
+            value: format!("{}#{}", PAGE_URL, block.date.format("%Y_%B_%-d")),
+        })
+        .description(format!("Wikipedia current events from {}", block.date))
+        .content(block.content.clone())
+        .build())
+}
+
+/// Renders `items` as an RSS channel and returns it serialised to a string.
+///
+/// When `exclude_today` is true, only blocks dated before today (UTC) are included.
+fn generate_feed(items: &[EventBlock], exclude_today: bool) -> Result<String> {
+    // Evaluate "today" once so every block is compared against the same date.
+    let today = Utc::now().date_naive();
+
+    let mut channel = ChannelBuilder::default()
+        .title("Mininews")
+        // NOTE(review): placeholder URL - replace with the deployed site's address.
+        .link("https://example.com")
+        .description("An RSS feed from Wikipedia's Current Events")
+        .build();
+
+    channel.set_items(
+        items
+            .iter()
+            .filter(|block| !exclude_today || block.date < today)
+            .map(build_item)
+            .collect::<Result<Vec<_>>>()?,
+    );
+
+    Ok(channel.to_string())
+}
+
+/// Downloads the Current Events page and returns its body as text.
+///
+/// A non-success HTTP status is returned as an error instead of being parsed as the page.
+fn fetch_document() -> Result<String> {
+    get(PAGE_URL)
+        .context("failed to get page")?
+        .error_for_status()
+        .context("page request returned an error status")?
+        .text()
+        .context("failed to get page content")
+}
+
+/// Fetches and parses the page, then writes both feed variants into `dist/`.
+fn main() -> Result<()> {
+    println!("fetching document...");
+    let document = fetch_document()?;
+    println!("parsing...");
+    let items = parse(&document)?;
+
+    // Unlike `create_dir`, this succeeds when the directory already exists.
+    fs::create_dir_all("dist").context("failed to create directory")?;
+
+    // `feed.xml` leaves out today's block; `feed_with_today.xml` includes it.
+    for exclude_today in [true, false] {
+        let feed = generate_feed(&items, exclude_today)?;
+        let suffix = if exclude_today { "" } else { "_with_today" };
+        fs::write(format!("dist/feed{}.xml", suffix), feed).context("failed to write file")?;
+    }
+
+    Ok(())
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..9c10b24
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,95 @@
+//! Parser for Wikipedia's Current Events portal page.
+
+use anyhow::{anyhow, Context, Result};
+use chrono::NaiveDate;
+use kuchikiki::parse_html;
+use kuchikiki::traits::*;
+use kuchikiki::NodeRef;
+
+/// Origin prepended to root-relative links to make them absolute.
+const RELATIVE_URL_BASE: &str = "https://en.wikipedia.org";
+/// Address of the Current Events portal page.
+pub const PAGE_URL: &str = "https://en.wikipedia.org/wiki/Portal:Current_events";
+
+/// One day's events as found on the portal page.
+#[derive(Debug, Clone)]
+pub struct EventBlock {
+    /// Day the events belong to, parsed from the block's `id` attribute.
+    pub date: NaiveDate,
+    /// Serialised HTML of the block's content, with relative links made absolute.
+    pub content: String,
+}
+
+/// Returns the absolute form of `href`, or `None` when it is not a relative link.
+fn absolute_href(href: &str) -> Option<String> {
+    if href.starts_with("//") {
+        // Protocol-relative link: it already names a host and only lacks a scheme.
+        Some(format!("https:{}", href))
+    } else if href.starts_with('/') {
+        Some(format!("{}{}", RELATIVE_URL_BASE, href))
+    } else {
+        None
+    }
+}
+
+/// Converts a single day's element into an [`EventBlock`].
+///
+/// # Errors
+///
+/// Fails when `node` is not an element, has no `id` attribute, the `id` is not a
+/// `%Y_%B_%-d` date, or no `div.current-events-content` descendant exists.
+fn parse_event_block(node: &NodeRef) -> Result<EventBlock> {
+    // Scope the attribute borrow so it is released before any link is rewritten.
+    let date = {
+        let attributes = node
+            .as_element()
+            .context("failed to parse element")?
+            .attributes
+            .borrow();
+        let id = attributes
+            .get("id")
+            .context("could not get event block id")?;
+        NaiveDate::parse_from_str(id, "%Y_%B_%-d")
+            .map_err(|e| anyhow!(e))
+            .context("failed to parse event block date")?
+    };
+
+    let content = node
+        .select_first("div.current-events-content")
+        .map_err(|_| anyhow!("failed to select event block content"))?;
+    let content_node = content.as_node();
+
+    // Rewrite relative links; anchors with no `href` or an absolute one are left untouched.
+    for link in content_node
+        .select("a")
+        .map_err(|_| anyhow!("failed to select event block links"))?
+    {
+        let mut attributes = link.attributes.borrow_mut();
+        let rewritten = attributes.get("href").and_then(absolute_href);
+        if let Some(href) = rewritten {
+            attributes.insert("href", href);
+        }
+    }
+
+    Ok(EventBlock {
+        date,
+        content: content_node.to_string(),
+    })
+}
+
+/// Extracts every non-empty day block from the portal page's HTML.
+///
+/// # Errors
+///
+/// Fails when the block selector cannot be applied or any selected block fails to parse.
+pub fn parse(content: &str) -> Result<Vec<EventBlock>> {
+    let document = parse_html().one(content);
+
+    document
+        .select("div.p-current-events-events div.current-events-main.vevent")
+        .map_err(|_| anyhow!("failed to select event blocks"))?
+        // Skip blocks containing an `li.mw-empty-elt` element (presumably days without events).
+        .filter(|el| el.as_node().select_first("li.mw-empty-elt").is_err())
+        .map(|el| parse_event_block(el.as_node()))
+        .collect()
+}