This commit is contained in:
lelgenio 2024-06-22 02:54:58 -03:00
commit da40e48b19
20 changed files with 4150 additions and 0 deletions

14
src/sources/mod.rs Normal file
View file

@@ -0,0 +1,14 @@
use anyhow::Result;
mod wikipedia;
/// A scrapeable site that can report the date of the latest leak.
///
/// Implementors supply a URL to fetch; the caller performs the HTTP
/// request and hands the response body back to [`Source::latest_leak`].
pub trait Source {
    /// Return the URL to query
    fn url(&self) -> String;
    /// Given the content of the url figure out the date of the latest leak
    fn latest_leak(&self, html: String) -> Result<time::Date>;
}
/// Every known leak source, boxed so the set can be shared across threads.
pub fn sources() -> Vec<Box<dyn Source + Send>> {
    let wikipedia: Box<dyn Source + Send> = Box::new(wikipedia::Wikipedia);
    vec![wikipedia]
}

src/sources/wikipedia.rs Normal file
View file

@@ -0,0 +1,115 @@
use std::{str::FromStr, time::Instant};
use super::Source;
use anyhow::{bail, Context, Result};
use regex::Regex;
use soup::{NodeExt, QueryBuilderExt};
/// Leak source backed by the English Wikipedia article on War Thunder.
pub struct Wikipedia;
impl Source for Wikipedia {
    fn url(&self) -> String {
        "https://en.wikipedia.org/wiki/War_Thunder".to_string()
    }

    /// Scan the article's tables for the one mentioning "Classified" and
    /// return the most recent date parsed from its rows' first cells.
    fn latest_leak(&self, html: String) -> Result<time::Date> {
        let document = soup::Soup::new(&html);

        // The leaks table is identified by its "Classified" column; insist on
        // exactly one match so a page redesign fails loudly instead of
        // silently picking the wrong table.
        let candidates: Vec<_> = document
            .tag("table")
            .find_all()
            .filter(|node| node.text().contains("Classified"))
            .collect();
        let [leaks_table] = &candidates[..] else {
            bail!("Cannot reliably find leaks table");
        };

        let body = leaks_table
            .tag("tbody")
            .find()
            .context("Could not find table body")?;

        // First <td> of each row holds the date text; rows without one are
        // skipped, as are cells whose text does not parse as a date.
        body.tag("tr")
            .find_all()
            .filter_map(|row| row.tag("td").find())
            .map(|cell| cell.text())
            .filter_map(|text| parse_wikipedia_date(&text).ok())
            .max()
            .context("Could not find any date?")
    }
}
fn parse_wikipedia_date(text: &str) -> Result<time::Date> {
let full_regex = Regex::new(r"(\w+)\s+(\d+),?\s+(\d+)").unwrap();
if let Some(cap) = full_regex.captures(text) {
let (_, [month, day, year]) = cap.extract();
let month = time::Month::from_str(month);
return time::Date::from_calendar_date(
year.parse().context("Failed to parse year")?,
month.context("Failed to parse month")?,
day.parse().context("Failed to parse day")?,
)
.context("Failed to create date from provided text");
}
let small_regex = Regex::new(r"(\w+) (\d+)").unwrap();
if let Some(cap) = small_regex.captures(text) {
let (_, [month, year]) = cap.extract();
let month = time::Month::from_str(month);
return time::Date::from_calendar_date(
year.parse().context("Failed to parse year")?,
month.context("Failed to parse month")?,
1,
)
.context("Failed to create date from provided text");
}
bail!("Failed to parse wikipedia date")
}
#[test]
fn test_wikipedia_html_parse() {
    // Integration-style check against a saved copy of the article.
    let fixture = std::fs::read_to_string("./data/wikipedia.html").unwrap();
    assert_eq!(
        time::Date::from_calendar_date(2023, time::Month::December, 12).unwrap(),
        Wikipedia.latest_leak(fixture).unwrap()
    );
}
#[test]
fn test_wikipedia_date_parse() {
    // Unparseable text must be rejected, not silently defaulted.
    assert!(parse_wikipedia_date("testing 123, 1234").is_err());

    // (input, expected (year, month, day)) — short forms default to day 1.
    let cases = [
        ("July 14, 2021", (2021, time::Month::July, 14)),
        (" July 14, 2021 ", (2021, time::Month::July, 14)),
        ("July 14 2021", (2021, time::Month::July, 14)),
        ("October 2021", (2021, time::Month::October, 1)),
        ("october 2021", (2021, time::Month::October, 1)),
    ];
    for (input, (year, month, day)) in cases {
        let expected = time::Date::from_calendar_date(year, month, day).unwrap();
        assert_eq!(expected, parse_wikipedia_date(input).unwrap(), "input: {input:?}");
    }
}