implement g4 media scraping
This commit is contained in:
parent
bcf2b083e7
commit
1212120a91
4 changed files with 137 additions and 5 deletions
|
@ -5,3 +5,7 @@ edition = "2021"
|
|||
|
||||
[dependencies]
# Logging backend driven by the RUST_LOG environment variable.
env_logger = "0.11.5"
# Async runtime; "full" enables all tokio features (macros, rt, io, ...).
tokio = { version = "1", features = ["full"] }
# Application-level error type used throughout the scraper.
anyhow = "1.0"
# HTTP client used to download the pages being scraped.
reqwest = "0.12.9"
# HTML parsing and CSS-selector querying.
scraper = "0.22.0"
|
|
12
src/main.rs
12
src/main.rs
|
@ -1,6 +1,16 @@
|
|||
use crate::scrapper::gfourmedia::G4Media;
|
||||
use crate::scrapper::WebScrapperEngine;
|
||||
mod scrapper;
|
||||
|
||||
fn main() {
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<(), anyhow::Error> {
|
||||
env_logger::init();
|
||||
println!("Hello, world!");
|
||||
|
||||
let scrapper = WebScrapperEngine::new(G4Media::default()).await?;
|
||||
let posts = scrapper.get_posts().await?;
|
||||
|
||||
posts.iter().for_each(|p| println!("{:?}", p));
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
@ -1,17 +1,56 @@
|
|||
mod gfourmedia;
|
||||
pub(crate) mod gfourmedia;
|
||||
|
||||
/// NewsPost represents a news post scraped from a website.
///
/// Every field is optional because a scraper fills in whatever it manages to
/// extract from the page; use [`NewsPost::is_complete`] to check whether the
/// minimum useful fields are present.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct NewsPost {
    /// A URL containing the image of the post.
    pub image: Option<String>,
    /// The title of the post.
    pub title: Option<String>,
    /// A summary of the post.
    pub summary: Option<String>,
    /// The content of the post.
    pub content: Option<String>,
    /// A link to the post.
    pub link: Option<String>,
    /// The author of the post.
    pub author: Option<String>,
}

impl NewsPost {
    /// Checks if the news post contains the minimum fields: a title, a
    /// summary and a link. Image, content and author are considered optional.
    pub fn is_complete(&self) -> bool {
        self.title.is_some() && self.summary.is_some() && self.link.is_some()
    }
}
|
||||
|
||||
/// Represents a web scrapper which is can be scraped by the engine.
|
||||
pub(crate) trait ScrappableWebPage {
|
||||
fn get_url(&self) -> &str;
|
||||
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error>;
|
||||
}
|
||||
|
||||
/// The web scraper engine is used to scrape web pages.
|
||||
pub struct WebScrapperEngine<P>
|
||||
where
|
||||
P: ScrappableWebPage,
|
||||
{
|
||||
web_page: P,
|
||||
}
|
||||
|
||||
impl<P> WebScrapperEngine<P>
|
||||
where
|
||||
P: ScrappableWebPage,
|
||||
{
|
||||
/// Creates a new instance of WebScrapperEngine
|
||||
pub async fn new(web_page: P) -> Result<Self, anyhow::Error> {
|
||||
Ok(WebScrapperEngine { web_page })
|
||||
}
|
||||
|
||||
pub async fn get_posts(&self) -> Result<Vec<NewsPost>, anyhow::Error> {
|
||||
let body = reqwest::get(self.web_page.get_url()).await?.text().await?;
|
||||
|
||||
let results = self.web_page.get_posts(body)?;
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1 +1,80 @@
|
|||
use crate::scrapper::{NewsPost, ScrappableWebPage};
|
||||
use anyhow::anyhow;
|
||||
use scraper::{Html, Selector};
|
||||
|
||||
/// G4 Media website scraper.
#[derive(Debug)]
pub struct G4Media {
    // Base URL of the site whose front page is scraped.
    url: String,
}

impl Default for G4Media {
    /// A scraper pointed at the public G4 Media site.
    fn default() -> Self {
        Self {
            url: String::from("https://www.g4media.ro"),
        }
    }
}
|
||||
|
||||
impl ScrappableWebPage for G4Media {
|
||||
fn get_url(&self) -> &str {
|
||||
&self.url
|
||||
}
|
||||
|
||||
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
|
||||
let document = Html::parse_document(&html);
|
||||
let mut posts: Vec<NewsPost> = vec![];
|
||||
let posts_selector =
|
||||
Selector::parse(".post-review").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
|
||||
let anchor_selector =
|
||||
Selector::parse("a").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_img_selector = Selector::parse(".post-img > a > img")
|
||||
.map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_title_selector =
|
||||
Selector::parse(".post-title").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_summary_selector =
|
||||
Selector::parse(".post-content p").map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
let post_metadata_author_selector = Selector::parse(".post-medatada .entry-author a")
|
||||
.map_err(|_e| anyhow!("failed to make selector"))?;
|
||||
|
||||
let selected_posts = document.select(&posts_selector);
|
||||
|
||||
for element in selected_posts {
|
||||
let mut news_post = NewsPost {
|
||||
image: None,
|
||||
title: None,
|
||||
summary: None,
|
||||
content: None,
|
||||
link: None,
|
||||
author: None,
|
||||
};
|
||||
|
||||
if let Some(selected_post_title) = element.select(&post_title_selector).next() {
|
||||
if let Some(post_link) = selected_post_title.select(&anchor_selector).next() {
|
||||
if let Some(href) = post_link.value().attr("href") {
|
||||
news_post.link = Some(href.to_owned());
|
||||
}
|
||||
if let Some(title) = post_link.value().attr("title") {
|
||||
news_post.title = Some(title.to_owned())
|
||||
}
|
||||
}
|
||||
}
|
||||
if let Some(selected_summary) = element.select(&post_summary_selector).next() {
|
||||
news_post.summary = Some(selected_summary.inner_html().trim().replace(" ", ""))
|
||||
}
|
||||
if let Some(selected_author) = element.select(&post_metadata_author_selector).next() {
|
||||
news_post.author = Some(selected_author.inner_html());
|
||||
}
|
||||
if let Some(selected_image) = element.select(&post_img_selector).next() {
|
||||
if let Some(image_source) = selected_image.attr("data-src") {
|
||||
news_post.image = Some(image_source.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
posts.push(news_post);
|
||||
}
|
||||
|
||||
Ok(posts)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue