init
This commit is contained in:
commit
da40e48b19
20 changed files with 4150 additions and 0 deletions
14
src/sources/mod.rs
Normal file
14
src/sources/mod.rs
Normal file
|
@ -0,0 +1,14 @@
|
|||
use anyhow::Result;
|
||||
|
||||
mod wikipedia;
|
||||
|
||||
pub trait Source {
|
||||
/// Return the URL to query
|
||||
fn url(&self) -> String;
|
||||
/// Given the content of the url figure out the date of the latest leak
|
||||
fn latest_leak(&self, html: String) -> Result<time::Date>;
|
||||
}
|
||||
|
||||
pub fn sources() -> Vec<Box<dyn Source + Send>> {
|
||||
vec![Box::new(wikipedia::Wikipedia)]
|
||||
}
|
115
src/sources/wikipedia/mod.rs
Normal file
115
src/sources/wikipedia/mod.rs
Normal file
|
@ -0,0 +1,115 @@
|
|||
use std::{str::FromStr, time::Instant};
|
||||
|
||||
use super::Source;
|
||||
use anyhow::{bail, Context, Result};
|
||||
use regex::Regex;
|
||||
use soup::{NodeExt, QueryBuilderExt};
|
||||
|
||||
/// Leak source backed by the English Wikipedia article on War Thunder.
pub struct Wikipedia;
|
||||
|
||||
impl Source for Wikipedia {
|
||||
fn url(&self) -> String {
|
||||
"https://en.wikipedia.org/wiki/War_Thunder".to_string()
|
||||
}
|
||||
|
||||
fn latest_leak(&self, html: String) -> Result<time::Date> {
|
||||
let soup = soup::Soup::new(&html);
|
||||
|
||||
let tables = soup.tag("table").find_all();
|
||||
|
||||
let tables_with_classified = tables
|
||||
.into_iter()
|
||||
.filter(|t| t.text().contains("Classified"))
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let table = match &tables_with_classified[..] {
|
||||
[table] => table,
|
||||
_ => bail!("Cannot reliably find leaks table"),
|
||||
};
|
||||
|
||||
let lines: Vec<String> = table
|
||||
.tag("tbody")
|
||||
.find()
|
||||
.context("Could not find table body")?
|
||||
.tag("tr")
|
||||
.find_all()
|
||||
.flat_map(|line| line.tag("td").find())
|
||||
.map(|td| td.text())
|
||||
.collect();
|
||||
|
||||
lines
|
||||
.iter()
|
||||
.flat_map(|txt| parse_wikipedia_date(txt))
|
||||
.max()
|
||||
.context("Could not find any date?")
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_wikipedia_date(text: &str) -> Result<time::Date> {
|
||||
let full_regex = Regex::new(r"(\w+)\s+(\d+),?\s+(\d+)").unwrap();
|
||||
|
||||
if let Some(cap) = full_regex.captures(text) {
|
||||
let (_, [month, day, year]) = cap.extract();
|
||||
|
||||
let month = time::Month::from_str(month);
|
||||
|
||||
return time::Date::from_calendar_date(
|
||||
year.parse().context("Failed to parse year")?,
|
||||
month.context("Failed to parse month")?,
|
||||
day.parse().context("Failed to parse day")?,
|
||||
)
|
||||
.context("Failed to create date from provided text");
|
||||
}
|
||||
|
||||
let small_regex = Regex::new(r"(\w+) (\d+)").unwrap();
|
||||
if let Some(cap) = small_regex.captures(text) {
|
||||
let (_, [month, year]) = cap.extract();
|
||||
|
||||
let month = time::Month::from_str(month);
|
||||
|
||||
return time::Date::from_calendar_date(
|
||||
year.parse().context("Failed to parse year")?,
|
||||
month.context("Failed to parse month")?,
|
||||
1,
|
||||
)
|
||||
.context("Failed to create date from provided text");
|
||||
}
|
||||
|
||||
bail!("Failed to parse wikipedia date")
|
||||
}
|
||||
|
||||
/// Runs the scraper against the saved article fixture and checks that the
/// newest leak it reports is 12 December 2023.
#[test]
fn test_wikipedia_html_parse() {
    let page = std::fs::read_to_string("./data/wikipedia.html").unwrap();

    let found = Wikipedia.latest_leak(page).unwrap();

    assert_eq!(
        time::Date::from_calendar_date(2023, time::Month::December, 12).unwrap(),
        found
    );
}
|
||||
|
||||
/// Checks the date parser against the supported text shapes: full dates with
/// and without a comma or surrounding whitespace, month/year pairs, a
/// lower-case month name, and a rejected non-date.
#[test]
fn test_wikipedia_date_parse() {
    use time::Month::{July, October};

    let date = |year, month, day| time::Date::from_calendar_date(year, month, day).unwrap();

    // A word that is not a month name must be rejected.
    assert!(parse_wikipedia_date("testing 123, 1234").is_err());

    let cases = [
        ("July 14, 2021", date(2021, July, 14)),
        (" July 14, 2021 ", date(2021, July, 14)),
        ("July 14 2021", date(2021, July, 14)),
        ("October 2021", date(2021, October, 1)),
        ("october 2021", date(2021, October, 1)),
    ];

    for (input, expected) in cases {
        assert_eq!(parse_wikipedia_date(input).unwrap(), expected);
    }
}
|
Loading…
Add table
Add a link
Reference in a new issue