implement g4 media scraping
parent bcf2b083e7
commit 1212120a91

4 changed files with 137 additions and 5 deletions
Cargo.toml
@@ -5,3 +5,7 @@ edition = "2021"
 
 [dependencies]
 env_logger = "0.11.5"
+tokio = { version = "1", features = ["full"] }
+anyhow = "1.0"
+reqwest = "0.12.9"
+scraper = "0.22.0"
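The new dependencies map onto the commit's needs: tokio supplies the async runtime driving main, reqwest the HTTP client that fetches pages, scraper the HTML parsing and CSS-selector matching, and anyhow the error type threaded through the new APIs.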
src/main.rs (12 changes)
@@ -1,6 +1,16 @@
+use crate::scrapper::gfourmedia::G4Media;
+use crate::scrapper::WebScrapperEngine;
 mod scrapper;
 
-fn main() {
+#[tokio::main]
+async fn main() -> Result<(), anyhow::Error> {
     env_logger::init();
     println!("Hello, world!");
+
+    let scrapper = WebScrapperEngine::new(G4Media::default()).await?;
+    let posts = scrapper.get_posts().await?;
+
+    posts.iter().for_each(|p| println!("{:?}", p));
+
+    Ok(())
 }
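With this change, cargo run fetches the configured page, scrapes it into NewsPost values, and prints each one through its Debug implementation.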
@ -1,17 +1,56 @@
|
||||||
mod gfourmedia;
|
pub(crate) mod gfourmedia;
|
||||||
|
|
||||||
/// NewsPost represents a news post.
|
/// NewsPost represents a news post.
|
||||||
|
#[derive(Debug)]
|
||||||
pub struct NewsPost {
|
pub struct NewsPost {
|
||||||
/// A URL containing the image of the post.
|
/// A URL containing the image of the post.
|
||||||
pub image: Option<String>,
|
pub image: Option<String>,
|
||||||
/// The title of the post.
|
/// The title of the post.
|
||||||
pub title: String,
|
pub title: Option<String>,
|
||||||
/// A summary of the post.
|
/// A summary of the post.
|
||||||
pub summary: Option<String>,
|
pub summary: Option<String>,
|
||||||
/// The content of the post.
|
/// The content of the post.
|
||||||
pub content: Option<String>,
|
pub content: Option<String>,
|
||||||
/// A link to the post.
|
/// A link to the post.
|
||||||
pub link: String,
|
pub link: Option<String>,
|
||||||
/// The author of the post.
|
/// The author of the post.
|
||||||
pub author: String,
|
pub author: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl NewsPost {
|
||||||
|
/// Is complete checks if the news post contains the minimum fields.
|
||||||
|
pub fn is_complete(&self) -> bool {
|
||||||
|
self.title.is_some() && self.summary.is_some() && self.link.is_some()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Represents a web scrapper which is can be scraped by the engine.
|
||||||
|
pub(crate) trait ScrappableWebPage {
|
||||||
|
fn get_url(&self) -> &str;
|
||||||
|
fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error>;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// The web scraper engine is used to scrape web pages.
|
||||||
|
pub struct WebScrapperEngine<P>
|
||||||
|
where
|
||||||
|
P: ScrappableWebPage,
|
||||||
|
{
|
||||||
|
web_page: P,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<P> WebScrapperEngine<P>
|
||||||
|
where
|
||||||
|
P: ScrappableWebPage,
|
||||||
|
{
|
||||||
|
/// Creates a new instance of WebScrapperEngine
|
||||||
|
pub async fn new(web_page: P) -> Result<Self, anyhow::Error> {
|
||||||
|
Ok(WebScrapperEngine { web_page })
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn get_posts(&self) -> Result<Vec<NewsPost>, anyhow::Error> {
|
||||||
|
let body = reqwest::get(self.web_page.get_url()).await?.text().await?;
|
||||||
|
|
||||||
|
let results = self.web_page.get_posts(body)?;
|
||||||
|
Ok(results)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
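The trait is what keeps the engine generic: any type that can hand back a URL and parse its own HTML plugs into WebScrapperEngine. A minimal sketch of a second implementor (not part of this commit; ExamplePage and its URL are hypothetical):

use crate::scrapper::{NewsPost, ScrappableWebPage};

// Hypothetical second page, for illustration only.
struct ExamplePage {
    url: String,
}

impl ScrappableWebPage for ExamplePage {
    fn get_url(&self) -> &str {
        &self.url
    }

    // A stub parser; a real page would walk `html` with scraper's
    // selectors, as the G4Media implementation below does.
    fn get_posts(&self, _html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
        Ok(vec![])
    }
}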
src/scrapper/gfourmedia.rs
@@ -1 +1,80 @@
+use crate::scrapper::{NewsPost, ScrappableWebPage};
+use anyhow::anyhow;
+use scraper::{Html, Selector};
+
+/// G4 Media website scraper.
+#[derive(Debug)]
+pub struct G4Media {
+    url: String,
+}
+
+impl Default for G4Media {
+    fn default() -> Self {
+        G4Media {
+            url: String::from("https://www.g4media.ro"),
+        }
+    }
+}
+
+impl ScrappableWebPage for G4Media {
+    fn get_url(&self) -> &str {
+        &self.url
+    }
+
+    fn get_posts(&self, html: String) -> Result<Vec<NewsPost>, anyhow::Error> {
+        let document = Html::parse_document(&html);
+        let mut posts: Vec<NewsPost> = vec![];
+        let posts_selector =
+            Selector::parse(".post-review").map_err(|_e| anyhow!("failed to make selector"))?;
+
+        let anchor_selector =
+            Selector::parse("a").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_img_selector = Selector::parse(".post-img > a > img")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_title_selector =
+            Selector::parse(".post-title").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_summary_selector =
+            Selector::parse(".post-content p").map_err(|_e| anyhow!("failed to make selector"))?;
+        let post_metadata_author_selector = Selector::parse(".post-medatada .entry-author a")
+            .map_err(|_e| anyhow!("failed to make selector"))?;
+
+        let selected_posts = document.select(&posts_selector);
+
+        for element in selected_posts {
+            let mut news_post = NewsPost {
+                image: None,
+                title: None,
+                summary: None,
+                content: None,
+                link: None,
+                author: None,
+            };
+
+            if let Some(selected_post_title) = element.select(&post_title_selector).next() {
+                if let Some(post_link) = selected_post_title.select(&anchor_selector).next() {
+                    if let Some(href) = post_link.value().attr("href") {
+                        news_post.link = Some(href.to_owned());
+                    }
+                    if let Some(title) = post_link.value().attr("title") {
+                        news_post.title = Some(title.to_owned())
+                    }
+                }
+            }
+            if let Some(selected_summary) = element.select(&post_summary_selector).next() {
+                news_post.summary = Some(selected_summary.inner_html().trim().replace("&nbsp;", ""))
+            }
+            if let Some(selected_author) = element.select(&post_metadata_author_selector).next() {
+                news_post.author = Some(selected_author.inner_html());
+            }
+            if let Some(selected_image) = element.select(&post_img_selector).next() {
+                if let Some(image_source) = selected_image.attr("data-src") {
+                    news_post.image = Some(image_source.to_string());
+                }
+            }
+
+            posts.push(news_post);
+        }
+
+        Ok(posts)
+    }
+}
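Not part of the commit, but a natural follow-up: get_posts is pure (a String of HTML in, a Vec<NewsPost> out), so it can be unit-tested against a static fixture. A minimal sketch, where the fixture markup is an assumption shaped only by the CSS classes the selectors above expect, not real g4media.ro output:

#[cfg(test)]
mod tests {
    use super::G4Media;
    use crate::scrapper::ScrappableWebPage;

    #[test]
    fn parses_a_post_review_block() {
        // Hypothetical fixture matching .post-review, .post-title, and
        // .post-content p; author and image selectors deliberately unmatched.
        let html = r#"
            <div class="post-review">
              <h2 class="post-title">
                <a href="https://www.g4media.ro/example.html"
                   title="Example title">Example title</a>
              </h2>
              <div class="post-content"><p>Example summary</p></div>
            </div>
        "#
        .to_string();

        let posts = G4Media::default().get_posts(html).expect("should parse");

        assert_eq!(posts.len(), 1);
        assert_eq!(posts[0].title.as_deref(), Some("Example title"));
        assert_eq!(
            posts[0].link.as_deref(),
            Some("https://www.g4media.ro/example.html")
        );
        assert_eq!(posts[0].summary.as_deref(), Some("Example summary"));
        assert!(posts[0].is_complete());
    }
}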