diff --git a/Cargo.toml b/Cargo.toml
index be98d4d..4f51417 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,3 +5,7 @@ edition = "2021"
 
 [dependencies]
 env_logger = "0.11.5"
+tokio = { version = "1", features = ["full"] }
+anyhow = "1.0"
+reqwest = "0.12.9"
+scraper = "0.22.0"
diff --git a/src/main.rs b/src/main.rs
index ec735b2..34d13bb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,6 +1,16 @@
+use crate::scrapper::gfourmedia::G4Media;
+use crate::scrapper::WebScrapperEngine;
 mod scrapper;
 
-fn main() {
+#[tokio::main]
+async fn main() -> Result<(), anyhow::Error> {
     env_logger::init();
     println!("Hello, world!");
+
+    let scrapper = WebScrapperEngine::new(G4Media::default()).await?;
+    let posts = scrapper.get_posts().await?;
+
+    posts.iter().for_each(|p| println!("{:?}", p));
+
+    Ok(())
 }
diff --git a/src/scrapper.rs b/src/scrapper.rs
index 22dd999..94be1f9 100644
--- a/src/scrapper.rs
+++ b/src/scrapper.rs
@@ -1,17 +1,56 @@
-mod gfourmedia;
+pub(crate) mod gfourmedia;
 
 /// NewsPost represents a news post.
+#[derive(Debug)]
 pub struct NewsPost {
     /// A URL containing the image of the post.
     pub image: Option<String>,
     /// The title of the post.
-    pub title: String,
+    pub title: Option<String>,
     /// A summary of the post.
     pub summary: Option<String>,
     /// The content of the post.
     pub content: Option<String>,
     /// A link to the post.
-    pub link: String,
+    pub link: Option<String>,
     /// The author of the post.
-    pub author: String,
+    pub author: Option<String>,
 }
+
+impl NewsPost {
+    /// Is complete checks if the news post contains the minimum fields.
+    pub fn is_complete(&self) -> bool {
+        self.title.is_some() && self.summary.is_some() && self.link.is_some()
+    }
+}
+
+/// Represents a web scrapper which is can be scraped by the engine.
+pub(crate) trait ScrappableWebPage {
+    fn get_url(&self) -> &str;
+    fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error>;
+}
+
+/// The web scraper engine is used to scrape web pages.
+pub struct WebScrapperEngine<P>
+where
+    P: ScrappableWebPage,
+{
+    web_page: P,
+}
+
+impl<P> WebScrapperEngine<P>
+where
+    P: ScrappableWebPage,
+{
+    /// Creates a new instance of WebScrapperEngine
+    pub async fn new(web_page: P) -> Result<Self, anyhow::Error> {
+        Ok(WebScrapperEngine { web_page })
+    }
+
+    pub async fn get_posts(&self) -> Result<Vec<NewsPost>, anyhow::Error> {
+        let body = reqwest::get(self.web_page.get_url()).await?.text().await?;
+
+        let results = self.web_page.get_posts(body)?;
+        Ok(results)
+    }
+}
diff --git a/src/scrapper/gfourmedia.rs b/src/scrapper/gfourmedia.rs
index 8b13789..d3efdac 100644
--- a/src/scrapper/gfourmedia.rs
+++ b/src/scrapper/gfourmedia.rs
@@ -1 +1,80 @@
-
+use crate::scrapper::{NewsPost, ScrappableWebPage};
+use anyhow::anyhow;
+use scraper::{Html, Selector};
+
+#[derive(Debug)]
+/// G4 Media website scraper
+pub struct G4Media {
+    url: String,
+}
+
+impl Default for G4Media {
+    fn default() -> Self {
+        G4Media {
+            url: String::from("https://www.g4media.ro"),
+        }
+    }
+}
+
+impl ScrappableWebPage for G4Media {
+    fn get_url(&self) -> &str {
+        &self.url
+    }
+
+    fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
+        let document = Html::parse_document(&html);
+        let mut posts: Vec<NewsPost> = vec![];
+        let posts_selector =
+            Selector::parse(".post-review").map_err(|_e| anyhow!("failed to make selector"))?;
+
+        let anchor_selector =
+            Selector::parse("a").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_img_selector = Selector::parse(".post-img > a > img")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_title_selector =
+            Selector::parse(".post-title").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_summary_selector =
+            Selector::parse(".post-content p").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_metadata_author_selector = Selector::parse(".post-medatada .entry-author a")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+
+        let selected_posts = document.select(&posts_selector);
+
+        for element in selected_posts {
+            let mut news_post = NewsPost {
+                image: None,
+                title: None,
+                summary: None,
+                content: None,
+                link: None,
+                author: None,
+            };
+
+            if let Some(selected_post_title) = element.select(&post_title_selector).next() {
+                if let Some(post_link) = selected_post_title.select(&anchor_selector).next() {
+                    if let Some(href) = post_link.value().attr("href") {
+                        news_post.link = Some(href.to_owned());
+                    }
+                    if let Some(title) = post_link.value().attr("title") {
+                        news_post.title = Some(title.to_owned())
+                    }
+                }
+            }
+            if let Some(selected_summary) = element.select(&post_summary_selector).next() {
+                news_post.summary = Some(selected_summary.inner_html().trim().replace("&nbsp;", ""))
+            }
+            if let Some(selected_author) = element.select(&post_metadata_author_selector).next() {
+                news_post.author = Some(selected_author.inner_html());
+            }
+            if let Some(selected_image) = element.select(&post_img_selector).next() {
+                if let Some(image_source) = selected_image.attr("data-src") {
+                    news_post.image = Some(image_source.to_string());
+                }
+            }
+
+            posts.push(news_post);
+        }
+
+        Ok(posts)
+    }
+}