implement redis streams publishing
parent 1f386fd002
commit 02ab5ecaa6
5 changed files with 137 additions and 40 deletions
@@ -12,3 +12,8 @@ scraper = "0.22.0"
 clokwerk = "0.4.0"
 log = "0.4.22"
 ctrlc = "3.4.5"
+clap = { version = "4.5.23", features = ["derive"] }
+redis = { version = "0.27.6", features = ["tokio-comp"] }
+md5 = "0.7.0"
+serde = { version = "1.0.216", features = ["derive"] }
+serde_json = "1.0.134"
17 scrapper/src/cli.rs Normal file
@@ -0,0 +1,17 @@
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+#[command(version, about, long_about = None)]
+pub struct CliArgs {
+    /// Redis connection string
+    #[arg(short, long)]
+    pub redis_connection_string: String,
+
+    /// Redis stream name
+    #[arg(short = 't', long)]
+    pub redis_stream_name: String,
+
+    /// The scraping interval in minutes
+    #[arg(short, long, default_value_t = 60)]
+    pub scrape_interval_minutes: u32,
+}
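For illustration, a minimal sketch of how these flags can be exercised with clap's `parse_from`. The connection string, stream name, and interval below are placeholder values, and `CliArgs` is assumed to be in scope inside the scrapper crate:

// Sketch only: feeds example arguments into the parser defined above.
use crate::cli::CliArgs; // assumes this runs inside the scrapper crate
use clap::Parser;

fn demo_args() -> CliArgs {
    CliArgs::parse_from([
        "scrapper",
        "--redis-connection-string", "redis://127.0.0.1:6379",
        "--redis-stream-name", "news",     // or the short form: -t news
        "--scrape-interval-minutes", "30", // omitted => default of 60
    ])
}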
@@ -1,5 +1,8 @@
+use crate::cli::CliArgs;
+use crate::redis::RedisService;
 use crate::scrapper::gfourmedia::G4Media;
 use crate::scrapper::{NewsPost, WebScrapperEngine};
+use clap::Parser;
 use clokwerk::{AsyncScheduler, Interval, TimeUnits};
 use log::{debug, error, info};
 use std::sync::atomic::{AtomicBool, Ordering};
@@ -8,47 +11,10 @@ use std::sync::{mpsc, Arc};
 use std::time::Duration;
 use tokio::task::JoinHandle;
 
+mod cli;
+mod redis;
 mod scrapper;
 
-#[tokio::main]
-async fn main() -> Result<(), anyhow::Error> {
-    env_logger::init();
-    info!("Starting the program");
-
-    // Scheduler setup
-    let mut scheduler = AsyncScheduler::new();
-
-    // Channel for synchronizing the scrapper and the bot
-    let (tx, rx): (Sender<NewsPost>, Receiver<NewsPost>) = mpsc::channel();
-
-    // Graceful shutdown.
-    let running = Arc::new(AtomicBool::new(true));
-    let r = running.clone();
-    ctrlc::set_handler(move || {
-        r.store(false, Ordering::SeqCst);
-    })
-    .expect("Error setting Ctrl-C handler");
-
-    run_scrapping_job(&mut scheduler, tx, 60.minutes());
-
-    // Run the scheduler in a separate thread.
-    let handle = run_scheduler(scheduler, running.clone());
-
-    for news_post in rx.iter() {
-        if !running.load(Ordering::SeqCst) {
-            debug!("Used requested shutdown.");
-            break;
-        }
-        info!("Received post {:?}", news_post)
-    }
-
-    info!("Stopped the program");
-
-    handle.await?;
-
-    Ok(())
-}
-
 /// Runs the scheduler in a separate thread.
 ///
 /// If CTRL+C is pressed it will set `running` to `false`.
@@ -82,3 +48,54 @@ fn run_scrapping_job(scheduler: &mut AsyncScheduler, tx: Sender<NewsPost>, inter
         }
     });
 }
+
+#[tokio::main]
+async fn main() -> Result<(), anyhow::Error> {
+    env_logger::init();
+    let args = CliArgs::parse();
+    info!("Starting the program");
+
+    // Redis setup
+    let mut redis_service =
+        RedisService::new(args.redis_connection_string, args.redis_stream_name).await;
+
+    // Scheduler setup
+    let mut scheduler = AsyncScheduler::new();
+
+    // Channel for synchronizing the scrapper and the bot
+    let (tx, rx): (Sender<NewsPost>, Receiver<NewsPost>) = mpsc::channel();
+
+    // Graceful shutdown.
+    let running = Arc::new(AtomicBool::new(true));
+    let r = running.clone();
+    ctrlc::set_handler(move || {
+        r.store(false, Ordering::SeqCst);
+    })
+    .expect("Error setting Ctrl-C handler");
+
+    run_scrapping_job(&mut scheduler, tx, args.scrape_interval_minutes.minutes());
+
+    // Run the scheduler in a separate thread.
+    let handle = run_scheduler(scheduler, running.clone());
+
+    for news_post in rx.iter() {
+        if !running.load(Ordering::SeqCst) {
+            debug!("User requested shutdown.");
+            break;
+        }
+        info!("Received post {:?}", news_post);
+        if news_post.is_complete() {
+            let title = news_post.title.clone().unwrap();
+            if !redis_service.is_post_seen(&title).await { // publish unseen posts only
+                redis_service.publish(news_post).await;
+                redis_service.mark_post_seen(&title, 60 * 60 * 24 * 3).await;
+            };
+        }
+    }
+
+    info!("Stopped the program");
+
+    handle.await?;
+
+    Ok(())
+}
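As a side note on the deduplication used above, a standalone sketch (assuming only the `md5` crate from the dependency list; the title is a placeholder) of how the seen-post key and TTL are derived: the key is the hex MD5 digest of the post title, and the TTL of 60 * 60 * 24 * 3 seconds works out to 259 200 seconds, i.e. three days.

// Standalone sketch of the seen-post key/TTL scheme used by main() and RedisService.
fn main() {
    let title = "Example post title"; // placeholder
    // Redis key: hex-encoded MD5 digest of the post title.
    let key = format!("{:x}", md5::compute(title));
    // TTL passed to mark_post_seen: three days, expressed in seconds.
    let ttl_seconds: u64 = 60 * 60 * 24 * 3; // 259_200
    println!("key = {key}, ttl = {ttl_seconds}s");
}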
56 scrapper/src/redis.rs Normal file
@@ -0,0 +1,56 @@
+use crate::scrapper::NewsPost;
+use log::error;
+use redis::aio::MultiplexedConnection;
+use redis::{AsyncCommands, RedisError};
+
+pub struct RedisService {
+    multiplexed_connection: MultiplexedConnection,
+    stream_name: String,
+}
+
+impl RedisService {
+    /// Creates a new RedisService instance.
+    pub async fn new(connection_string: String, stream_name: String) -> Self {
+        let client = redis::Client::open(connection_string).unwrap();
+        let con = client.get_multiplexed_async_connection().await.unwrap();
+
+        RedisService {
+            multiplexed_connection: con,
+            stream_name,
+        }
+    }
+
+    //noinspection RsSelfConvention
+    /// Returns true if the key exists in Redis, false otherwise.
+    pub async fn is_post_seen(&mut self, title: &str) -> bool {
+        let digest = md5::compute(title);
+        let result: Result<bool, RedisError> = self
+            .multiplexed_connection
+            .get(format!("{:x}", digest))
+            .await;
+        result.unwrap_or(false)
+    }
+
+    /// Marks the post as seen.
+    pub async fn mark_post_seen(&mut self, title: &str, ttl: u64) {
+        let digest = md5::compute(title);
+        let _ = self
+            .multiplexed_connection
+            .set_ex::<String, bool, bool>(format!("{:x}", digest), true, ttl)
+            .await;
+    }
+
+    /// Publishes the post to the Redis stream.
+    pub async fn publish(&mut self, post: NewsPost) {
+        let serialized_post = serde_json::to_string(&post).unwrap();
+        let result = redis::cmd("XADD")
+            .arg(format!("posts:{}", self.stream_name))
+            .arg("*")
+            .arg("post").arg(serialized_post) // XADD stores field/value pairs
+            .exec_async(&mut self.multiplexed_connection)
+            .await;
+        if result.is_err() {
+            error!("Failed to publish {:?} to stream", post);
+        }
+    }
+}
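On the consuming side, a minimal sketch of reading the stream written by `publish` above. It assumes the same `redis` 0.27 crate with the `tokio-comp` feature, a tokio runtime, a placeholder connection string, and the stream key `posts:news` (matching a `--redis-stream-name` of `news`); it blocks on XREAD and just debug-prints the raw reply:

// Minimal consumer sketch for the stream produced by RedisService::publish.
// Connection string and stream name are placeholders.
use redis::Value;

#[tokio::main]
async fn main() -> Result<(), redis::RedisError> {
    let client = redis::Client::open("redis://127.0.0.1:6379")?;
    let mut con = client.get_multiplexed_async_connection().await?;

    loop {
        // Block for up to 5 seconds waiting for entries newer than "$" (i.e. new ones).
        let reply: Value = redis::cmd("XREAD")
            .arg("BLOCK").arg(5000)
            .arg("STREAMS")
            .arg("posts:news")
            .arg("$")
            .query_async(&mut con)
            .await?;
        // The reply nests [[stream, [[id, [field, value, ...]], ...]], ...]; the value of
        // the "post" field is the serde_json payload, which a consumer that has the
        // NewsPost type could deserialize with serde_json::from_str.
        println!("{reply:?}");
        // A real consumer would remember the last returned entry id (or use consumer
        // groups) instead of re-blocking on "$", which can miss entries between calls.
    }
}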
@@ -1,7 +1,9 @@
+use serde::{Deserialize, Serialize};
+
 pub(crate) mod gfourmedia;
 
 /// NewsPost represents a news post.
-#[derive(Debug, Clone)]
+#[derive(Serialize, Deserialize, Debug, Clone)]
 pub struct NewsPost {
     /// A URL containing the image of the post.
     pub image: Option<String>,