implement g4 media scraping
This commit is contained in:
parent
bcf2b083e7
commit
1212120a91
4 changed files with 137 additions and 5 deletions
|
@ -5,3 +5,7 @@ edition = "2021"
|
|||
|
||||
[dependencies]
# Logging backend driven by the RUST_LOG environment variable.
env_logger = "0.11.5"
# Async runtime; "full" enables all tokio features (macros, rt, io, ...).
tokio = { version = "1", features = ["full"] }
# Application-level error type used throughout the scraper.
anyhow = "1.0"
# HTTP client used to download the pages being scraped.
reqwest = "0.12.9"
# HTML parsing and CSS-selector querying.
scraper = "0.22.0"
|
|
12
src/main.rs
12
src/main.rs
|
@ -1,6 +1,16 @@
|
|||
use crate::scrapper::gfourmedia::G4Media;
|
||||
use crate::scrapper::WebScrapperEngine;
|
||||
mod scrapper;
|
||||
|
||||
fn main() {
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), anyhow::Error> {
|
||||
env_logger::init();
|
||||
println!("Hello, world!");
|
||||
|
||||
let scrapper = WebScrapperEngine::new(G4Media::default()).await?;
|
||||
let posts = scrapper.get_posts().await?;
|
||||
|
||||
posts.iter().for_each(|p| println!("{:?}", p));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,17 +1,56 @@
|
|||
mod gfourmedia;
|
||||
pub(crate) mod gfourmedia;
|
||||
|
||||
/// NewsPost represents a news post scraped from a website.
///
/// Every field is optional because a scraper fills in whatever it manages to
/// extract from the page; use [`NewsPost::is_complete`] to check whether the
/// minimum useful fields are present.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct NewsPost {
    /// A URL containing the image of the post.
    pub image: Option<String>,
    /// The title of the post.
    pub title: Option<String>,
    /// A summary of the post.
    pub summary: Option<String>,
    /// The content of the post.
    pub content: Option<String>,
    /// A link to the post.
    pub link: Option<String>,
    /// The author of the post.
    pub author: Option<String>,
}

impl NewsPost {
    /// Checks if the news post contains the minimum fields: a title, a
    /// summary and a link. Image, content and author are considered optional.
    pub fn is_complete(&self) -> bool {
        self.title.is_some() && self.summary.is_some() && self.link.is_some()
    }
}
|
||||
|
||||
/// Represents a web scrapper which is can be scraped by the engine.
|
||||
pub(crate) trait ScrappableWebPage {
|
||||
fn get_url(&self) -> &str;
|
||||
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error>;
|
||||
}
|
||||
|
||||
/// The web scraper engine is used to scrape web pages.
|
||||
pub struct WebScrapperEngine<P>
|
||||
where
|
||||
P: ScrappableWebPage,
|
||||
{
|
||||
web_page: P,
|
||||
}
|
||||
|
||||
impl<P> WebScrapperEngine<P>
|
||||
where
|
||||
P: ScrappableWebPage,
|
||||
{
|
||||
/// Creates a new instance of WebScrapperEngine
|
||||
pub async fn new(web_page: P) -> Result<Self, anyhow::Error> {
|
||||
Ok(WebScrapperEngine { web_page })
|
||||
}
|
||||
|
||||
pub async fn get_posts(&self) -> Result<Vec<NewsPost>, anyhow::Error> {
|
||||
let body = reqwest::get(self.web_page.get_url()).await?.text().await?;
|
||||
|
||||
let results = self.web_page.get_posts(body)?;
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1 +1,80 @@
|
|||
use crate::scrapper::{NewsPost, ScrappableWebPage};
|
||||
use anyhow::anyhow;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// G4 Media website scraper.
#[derive(Debug)]
pub struct G4Media {
    // Base URL of the site whose front page is scraped.
    url: String,
}

impl Default for G4Media {
    /// A scraper pointed at the public G4 Media site.
    fn default() -> Self {
        Self {
            url: String::from("https://www.g4media.ro"),
        }
    }
}
|
||||
|
||||
impl ScrappableWebPage for G4Media {
|
||||
fn get_url(&self) -> &str {
|
||||
&self.url
|
||||
}
|
||||
|
||||
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
|
||||
let document = Html::parse_document(&html);
|
||||
let mut posts: Vec<NewsPost> = vec![];
|
||||
let posts_selector =
|
||||
Selector::parse(".post-review").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
|
||||
let anchor_selector =
|
||||
Selector::parse("a").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_img_selector = Selector::parse(".post-img > a > img")
|
||||
.map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_title_selector =
|
||||
Selector::parse(".post-title").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_summary_selector =
|
||||
Selector::parse(".post-content p").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_metadata_author_selector = Selector::parse(".post-medatada .entry-author a")
|
||||
.map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
|
||||
let selected_posts = document.select(&posts_selector);
|
||||
|
||||
for element in selected_posts {
|
||||
let mut news_post = NewsPost {
|
||||
image: None,
|
||||
title: None,
|
||||
summary: None,
|
||||
content: None,
|
||||
link: None,
|
||||
author: None,
|
||||
};
|
||||
|
||||
if let Some(selected_post_title) = element.select(&post_title_selector).next() {
|
||||
if let Some(post_link) = selected_post_title.select(&anchor_selector).next() {
|
||||
if let Some(href) = post_link.value().attr("href") {
|
||||
news_post.link = Some(href.to_owned());
|
||||
}
|
||||
if let Some(title) = post_link.value().attr("title") {
|
||||
news_post.title = Some(title.to_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(selected_summary) = element.select(&post_summary_selector).next() {
|
||||
news_post.summary = Some(selected_summary.inner_html().trim().replace(" ", ""))
|
||||
}
|
||||
if let Some(selected_author) = element.select(&post_metadata_author_selector).next() {
|
||||
news_post.author = Some(selected_author.inner_html());
|
||||
}
|
||||
if let Some(selected_image) = element.select(&post_img_selector).next() {
|
||||
if let Some(image_source) = selected_image.attr("data-src") {
|
||||
news_post.image = Some(image_source.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
posts.push(news_post);
|
||||
}
|
||||
|
||||
Ok(posts)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue