From 423ef7455fa70c28d4c0eef05f9487281e5ccfec Mon Sep 17 00:00:00 2001 From: Jake Walker Date: Tue, 21 Jan 2025 23:40:56 +0000 Subject: [PATCH] initial commit --- .editorconfig | 12 +++++ .gitignore | 122 ++++++++++++++++++++++++++++++++++++++++++ .vscode/settings.json | 3 ++ Cargo.toml | 11 ++++ LICENSE | 13 +++++ README.md | 3 ++ src/lib.rs | 1 + src/main.rs | 58 ++++++++++++++++++++ src/parser.rs | 77 ++++++++++++++++++++++++++ 9 files changed, 300 insertions(+) create mode 100644 .editorconfig create mode 100644 .gitignore create mode 100644 .vscode/settings.json create mode 100644 Cargo.toml create mode 100644 LICENSE create mode 100644 README.md create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/parser.rs diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..c1e2c64 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +# EditorConfig is awesome: https://EditorConfig.org + +# top-most EditorConfig file +root = true + +[*] +indent_style = space +indent_size = 4 +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..06b6ee6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,122 @@ +# File created using '.gitignore Generator' for Visual Studio Code: https://bit.ly/vscode-gig +# Created by https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,rust,macos,linux,rust-analyzer +# Edit at https://www.toptal.com/developers/gitignore?templates=windows,visualstudiocode,rust,macos,linux,rust-analyzer + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble 
+.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### Rust ### +# Generated by Cargo +# will have compiled files and executables +debug/ +target/ + +# Remove Cargo.lock from gitignore if creating an executable, leave it for libraries +# More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html +Cargo.lock + +# These are backup files generated by rustfmt +**/*.rs.bk + +# MSVC Windows builds of rustc generate these, which store debugging information +*.pdb + +### rust-analyzer ### +# Can be generated by other build systems other than cargo (ex: bazelbuild/rust_rules) +rust-project.json + + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.toptal.com/developers/gitignore/api/windows,visualstudiocode,rust,macos,linux,rust-analyzer + +# Custom rules (everything added below won't be overriden by 'Generate .gitignore File' if you use 'Update' option) + +sample.html diff --git a/.vscode/settings.json 
b/.vscode/settings.json new file mode 100644 index 0000000..b29bf32 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "licenser.license": "WTFPL" +} diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..fca7de7 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "mininews" +version = "0.1.0" +edition = "2021" + +[dependencies] +anyhow = "1.0.95" +chrono = { version = "0.4.39", default-features = false, features = ["now"] } +kuchikiki = "0.8.2" +rss = { version = "2.0.11" } +uuid = { version = "1.12.1", features = ["v7"] } diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..eeb8a94 --- /dev/null +++ b/LICENSE @@ -0,0 +1,13 @@ +DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE +Version 2, December 2004 + +Copyright (C) 2025 Jake Walker + +Everyone is permitted to copy and distribute verbatim or modified +copies of this license document, and changing it is allowed as long +as the name is changed. + +DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE +TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + +0. You just DO WHAT THE FUCK YOU WANT TO. diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea491dc --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# Mininews + +A simple RSS generator for [Wikipedia's Current Events](https://en.wikipedia.org/wiki/Portal:Current_events). Inspired by [tom-james-watson's detoxed.news](https://github.com/tom-james-watson/detoxed.news) project. 
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..67c567f
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1 @@
+pub mod parser;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..4e0c496
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,70 @@
+use anyhow::{Context, Error, Result};
+use chrono::{Days, NaiveTime, Utc};
+use mininews::parser::{parse, EventBlock, PAGE_URL};
+use rss::{ChannelBuilder, Guid, Item, ItemBuilder};
+use uuid::{Timestamp, Uuid};
+
+/// Builds an RSS channel from the parsed event blocks and prints it to stdout.
+///
+/// Blocks dated today (UTC) or later are skipped. Every other block becomes one
+/// item whose publication date is midnight UTC on the day after the block's date.
+///
+/// # Errors
+///
+/// Returns an error if a publication date cannot be computed or falls before
+/// the Unix epoch.
+fn generate_feed(items: Vec<EventBlock>) -> Result<()> {
+    let mut channel = ChannelBuilder::default()
+        .title("Mininews")
+        .link("https://example.com")
+        .description("An RSS feed from Wikipedia's Current Events")
+        .build();
+
+    // Read the clock once so every block is compared against the same day.
+    let today = Utc::now().date_naive();
+
+    channel.set_items(
+        items
+            .iter()
+            .filter(|x| x.date < today)
+            .map(|x| {
+                let pub_date = x
+                    .date
+                    .checked_add_days(Days::new(1))
+                    .context("failed to add to date")?
+                    .and_time(NaiveTime::MIN)
+                    .and_utc();
+                // A checked conversion instead of `as u64`, which would silently wrap
+                // a negative timestamp into a huge one.
+                let pub_secs = u64::try_from(pub_date.timestamp())
+                    .context("publication date is before the Unix epoch")?;
+                Ok::<Item, Error>(
+                    ItemBuilder::default()
+                        .title(x.date.to_string())
+                        .link(Some(PAGE_URL.to_string()))
+                        .pub_date(pub_date.to_rfc2822())
+                        .guid(Guid {
+                            permalink: false,
+                            // NOTE(review): v7 UUIDs mix random bits into the value, so this
+                            // GUID presumably differs on every run - confirm that is intended.
+                            value: Uuid::new_v7(Timestamp::from_unix_time(pub_secs, 0, 0, 0))
+                                .to_string(),
+                        })
+                        .description(format!("Wikipedia current events from {}", x.date))
+                        .content(x.content.clone())
+                        .build(),
+                )
+            })
+            .collect::<Result<Vec<_>, _>>()?,
+    );
+
+    // Print with `{}`: `{:?}` would emit the XML as an escaped, quoted string literal.
+    println!("{}", channel.to_string());
+
+    Ok(())
+}
+
+/// Parses the event blocks and prints the resulting feed.
+fn main() -> Result<()> {
+    generate_feed(parse()?)
+}
diff --git a/src/parser.rs b/src/parser.rs
new file mode 100644
index 0000000..6e3ff30
--- /dev/null
+++ b/src/parser.rs
@@ -0,0 +1,100 @@
+use anyhow::{anyhow, Context, Result};
+use chrono::NaiveDate;
+use kuchikiki::parse_html;
+use kuchikiki::traits::*;
+use kuchikiki::NodeRef;
+
+const RELATIVE_URL_BASE: &str = "https://en.wikipedia.org";
+pub const PAGE_URL: &str = "https://en.wikipedia.org/wiki/Portal:Current_events";
+
+/// The events listed for a single day.
+#[derive(Debug, Clone)]
+pub struct EventBlock {
+    /// Day taken from the block's `id` attribute.
+    pub date: NaiveDate,
+    /// Serialized HTML of the block's content element, with rewritten links.
+    pub content: String,
+}
+
+/// Returns the absolute form of a relative `href`, or `None` when `href` needs no rewrite.
+fn absolute_href(href: &str) -> Option<String> {
+    if href.starts_with("//") {
+        // Protocol-relative link: it already names a host, so only add a scheme.
+        Some(format!("https:{}", href))
+    } else if href.starts_with('/') {
+        Some(format!("{}{}", RELATIVE_URL_BASE, href))
+    } else {
+        None
+    }
+}
+
+/// Parses one event block element into an [`EventBlock`].
+///
+/// The date is read from the element's `id` attribute (`%Y_%B_%-d`), and the
+/// content is the serialized `div.current-events-content` descendant after its
+/// relative links have been made absolute.
+///
+/// # Errors
+///
+/// Returns an error if `node` is not an element, has no parsable `id`, or has no
+/// content element.
+fn parse_event_block(node: &NodeRef) -> Result<EventBlock> {
+    let element = node.as_element().context("failed to parse element")?;
+
+    let element_attributes = element.attributes.borrow();
+
+    let date_str = element_attributes
+        .get("id")
+        .context("could not get event block id")?;
+    let date = NaiveDate::parse_from_str(date_str, "%Y_%B_%-d")
+        .map_err(|e| anyhow!(e.to_string()))
+        .context("failed to parse event block date")?;
+
+    let content = node
+        .select_first("div.current-events-content")
+        .map_err(|_| anyhow!("failed to select event block content"))?;
+    let content_node = content.as_node();
+
+    // rewrite relative links
+    for link in content_node
+        .select("a")
+        .map_err(|_| anyhow!("failed to select event block links"))?
+    {
+        let mut link_attributes = link
+            .as_node()
+            .as_element()
+            .context("failed to parse event block link")?
+            .attributes
+            .borrow_mut();
+
+        // An anchor without an `href` has nothing to rewrite; skip it instead of
+        // failing the whole block.
+        let absolute = match link_attributes.get("href").and_then(absolute_href) {
+            Some(absolute) => absolute,
+            None => continue,
+        };
+
+        link_attributes.insert("href", absolute);
+    }
+
+    Ok(EventBlock {
+        date,
+        content: content_node.to_string(),
+    })
+}
+
+/// Parses every event block found in the bundled `sample.html` page.
+///
+/// # Errors
+///
+/// Returns an error if the block selector fails or any block fails to parse.
+pub fn parse() -> Result<Vec<EventBlock>> {
+    let data = include_str!("../sample.html");
+    let document = parse_html().one(data);
+
+    document
+        .select("div.p-current-events-events div.current-events-main.vevent")
+        .map_err(|_| anyhow!("failed to select event blocks"))?
+        .map(|el| parse_event_block(el.as_node()))
+        .collect()
+}