implement g4 media scraping

Denis-Cosmin Nutiu 2024-12-20 23:12:24 +02:00
parent bcf2b083e7
commit 1212120a91
4 changed files with 137 additions and 5 deletions

Cargo.toml

@@ -5,3 +5,7 @@ edition = "2021"
 [dependencies]
 env_logger = "0.11.5"
+tokio = { version = "1", features = ["full"] }
+anyhow = "1.0"
+reqwest = "0.12.9"
+scraper = "0.22.0"

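A side note on the dependencies, not part of the commit itself: tokio with the "full" feature set is convenient but pulls in every Tokio component. If compile times become a concern, a trimmed feature list is likely sufficient for #[tokio::main] and reqwest (an untested suggestion):

    tokio = { version = "1", features = ["macros", "rt-multi-thread"] }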
src/main.rs

@@ -1,6 +1,16 @@
+use crate::scrapper::gfourmedia::G4Media;
+use crate::scrapper::WebScrapperEngine;
+
 mod scrapper;
 
-fn main() {
+#[tokio::main]
+async fn main() -> Result<(), anyhow::Error> {
     env_logger::init();
     println!("Hello, world!");
+
+    let scrapper = WebScrapperEngine::new(G4Media::default()).await?;
+    let posts = scrapper.get_posts().await?;
+
+    posts.iter().for_each(|p| println!("{:?}", p));
+
+    Ok(())
 }

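Reviewer note: since NewsPost::is_complete is introduced later in this commit, main could also skip partial posts before printing them. A possible follow-up, sketched here rather than part of the diff:

    posts
        .iter()
        .filter(|p| p.is_complete())
        .for_each(|p| println!("{:?}", p));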
src/scrapper/mod.rs

@ -1,17 +1,56 @@
mod gfourmedia; pub(crate) mod gfourmedia;
/// NewsPost represents a news post. /// NewsPost represents a news post.
#[derive(Debug)]
pub struct NewsPost { pub struct NewsPost {
/// A URL containing the image of the post. /// A URL containing the image of the post.
pub image: Option<String>, pub image: Option<String>,
/// The title of the post. /// The title of the post.
pub title: String, pub title: Option<String>,
/// A summary of the post. /// A summary of the post.
pub summary: Option<String>, pub summary: Option<String>,
/// The content of the post. /// The content of the post.
pub content: Option<String>, pub content: Option<String>,
/// A link to the post. /// A link to the post.
pub link: String, pub link: Option<String>,
/// The author of the post. /// The author of the post.
pub author: String, pub author: Option<String>,
}
impl NewsPost {
/// Is complete checks if the news post contains the minimum fields.
pub fn is_complete(&self) -> bool {
self.title.is_some() && self.summary.is_some() && self.link.is_some()
}
}
/// Represents a web scrapper which is can be scraped by the engine.
pub(crate) trait ScrappableWebPage {
fn get_url(&self) -> &str;
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error>;
}
/// The web scraper engine is used to scrape web pages.
pub struct WebScrapperEngine<P>
where
P: ScrappableWebPage,
{
web_page: P,
}
impl<P> WebScrapperEngine<P>
where
P: ScrappableWebPage,
{
/// Creates a new instance of WebScrapperEngine
pub async fn new(web_page: P) -> Result<Self, anyhow::Error> {
Ok(WebScrapperEngine { web_page })
}
pub async fn get_posts(&self) -> Result<Vec<NewsPost>, anyhow::Error> {
let body = reqwest::get(self.web_page.get_url()).await?.text().await?;
let results = self.web_page.get_posts(body)?;
Ok(results)
}
} }

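For context, the ScrappableWebPage trait and WebScrapperEngine split fetching from parsing: the engine downloads the page, and the page type turns the HTML into posts. A minimal sketch of plugging in another page; ExamplePage and its URL are made up for illustration and are not part of this commit:

    use crate::scrapper::{NewsPost, ScrappableWebPage, WebScrapperEngine};

    // Hypothetical page type, used only to show the trait contract.
    struct ExamplePage;

    impl ScrappableWebPage for ExamplePage {
        fn get_url(&self) -> &str {
            "https://example.com/news"
        }

        fn get_posts(&self, _html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
            // A real implementation would parse the HTML here.
            Ok(vec![])
        }
    }

    // Usage mirrors main.rs:
    //     let engine = WebScrapperEngine::new(ExamplePage).await?;
    //     let posts = engine.get_posts().await?;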
src/scrapper/gfourmedia.rs

@@ -1 +1,80 @@
+use crate::scrapper::{NewsPost, ScrappableWebPage};
+use anyhow::anyhow;
+use scraper::{Html, Selector};
+
+/// G4 Media website scraper.
+#[derive(Debug)]
+pub struct G4Media {
+    url: String,
+}
+
+impl Default for G4Media {
+    fn default() -> Self {
+        G4Media {
+            url: String::from("https://www.g4media.ro"),
+        }
+    }
+}
+
+impl ScrappableWebPage for G4Media {
+    fn get_url(&self) -> &str {
+        &self.url
+    }
+
+    fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
+        let document = Html::parse_document(&html);
+        let mut posts: Vec<NewsPost> = vec![];
+        let posts_selector =
+            Selector::parse(".post-review").map_err(|_e| anyhow!("failed to make selector"))?;
+        let anchor_selector =
+            Selector::parse("a").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_img_selector = Selector::parse(".post-img > a > img")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_title_selector =
+            Selector::parse(".post-title").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_summary_selector =
+            Selector::parse(".post-content p").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_metadata_author_selector = Selector::parse(".post-medatada .entry-author a")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+
+        let selected_posts = document.select(&posts_selector);
+        for element in selected_posts {
+            let mut news_post = NewsPost {
+                image: None,
+                title: None,
+                summary: None,
+                content: None,
+                link: None,
+                author: None,
+            };
+            if let Some(selected_post_title) = element.select(&post_title_selector).next() {
+                if let Some(post_link) = selected_post_title.select(&anchor_selector).next() {
+                    if let Some(href) = post_link.value().attr("href") {
+                        news_post.link = Some(href.to_owned());
+                    }
+                    if let Some(title) = post_link.value().attr("title") {
+                        news_post.title = Some(title.to_owned())
+                    }
+                }
+            }
+            if let Some(selected_summary) = element.select(&post_summary_selector).next() {
+                news_post.summary =
+                    Some(selected_summary.inner_html().trim().replace("&nbsp;", ""))
+            }
+            if let Some(selected_author) = element.select(&post_metadata_author_selector).next() {
+                news_post.author = Some(selected_author.inner_html());
+            }
+            if let Some(selected_image) = element.select(&post_img_selector).next() {
+                if let Some(image_source) = selected_image.attr("data-src") {
+                    news_post.image = Some(image_source.to_string());
+                }
+            }
+            posts.push(news_post);
+        }
+        Ok(posts)
+    }
+}
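The selector logic above is testable without hitting the network. A possible unit test for gfourmedia.rs, using a hand-written fixture that mirrors the CSS classes the selectors expect (the markup is an assumption, not captured from g4media.ro; "post-medatada" deliberately repeats the spelling used by the selector):

    #[cfg(test)]
    mod tests {
        use super::*;

        // Minimal fixture shaped like one article card.
        const FIXTURE: &str = r#"
            <div class="post-review">
                <div class="post-img"><a href="#"><img data-src="https://example.com/img.jpg"></a></div>
                <h3 class="post-title"><a href="https://example.com/post" title="Example title"></a></h3>
                <div class="post-content"><p>Example summary</p></div>
                <div class="post-medatada"><span class="entry-author"><a>Jane Doe</a></span></div>
            </div>"#;

        #[test]
        fn parses_fixture_post() {
            let page = G4Media::default();
            let posts = page.get_posts(FIXTURE.to_string()).unwrap();

            assert_eq!(posts.len(), 1);
            assert!(posts[0].is_complete());
            assert_eq!(posts[0].author.as_deref(), Some("Jane Doe"));
            assert_eq!(posts[0].image.as_deref(), Some("https://example.com/img.jpg"));
        }
    }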