// dao-governance-framework/semantic-scholar-client/src/bin/import.rs
//
// 147 lines
// 3.9 KiB
// Rust

// During development, allowing dead code
#![allow(dead_code)]
use async_recursion::async_recursion;
use clap::Parser;
use std::cmp::min;
use std::fmt::Write;
use std::error::Error;
use serde::Deserialize;
/// Result alias used by the fetch functions in this file; the error side is a
/// boxed `dyn Error`, so any error type can be propagated with `?`.
type DataResult<T> = Result<T, Box<dyn Error>>;
/// Root URL that every request in this file is built from; endpoint paths such
/// as `/paper/{id}/citations` are appended to it.
const BASE_URL: &str = "https://api.semanticscholar.org/graph/v1";
/// Upper bound for the `depth` counter in `get_citations`: once the counter is
/// greater than this value the recursion stops without issuing a request.
const MAX_DEPTH: u32 = 3;
// Command-line arguments, parsed with clap's derive API via `Args::parse()` in `main`.
// NOTE: the `///` comments on the fields are picked up by clap as the `--help`
// text, so they are runtime strings and are intentionally left unchanged here.
#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
// `main` forwards this value as the initial `depth` argument of `get_citations`.
/// How deep to traverse citation graph from the starting paper
#[clap(short, long, value_parser)]
depth: u32,
// `main` forwards this value as the `paper_id` of the first `get_citations` call.
/// Starting paper. We will traverse papers that cite this one
#[clap(short, long, value_parser)]
paper_id: String,
// Disabled option (not compiled); the matching binding in `main` is commented out too.
// Write the results to MongoDB
// #[clap(short, long, value_parser)]
// write_to_mongo: bool,
}
/// A paper author, represented only by a name.
///
/// Derives the same kind of diagnostics support as the other data structs in
/// this file (`Debug`), plus `Clone`/`PartialEq`/`Eq` so that collections of
/// authors can be copied and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
struct Author {
    /// The author's name.
    name: String,
}

/// List of authors; this is the accumulator type handed to the fetch functions.
type Authors = Vec<Author>;
/// A paper record as deserialized from JSON.
///
/// `rename_all = "camelCase"` means the JSON keys are the camelCase forms of
/// the field names (e.g. `paper_id` is read from `paperId`).
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
struct Paper {
// Identifier of the paper; required (not an `Option`).
paper_id: String,
// Title of the paper; `Option`, so it may be absent or null in the JSON.
title: Option<String>,
// Citation entries for this paper; required, and each entry must match `Citation`.
citations: Vec<Citation>,
}
/// The paper on the citing side of a citation; occurs within the `Citation` struct.
///
/// Both fields are optional. `get_citations` skips entries where either one
/// is missing. JSON keys are camelCase (`paper_id` is read from `paperId`).
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
struct CitingPaper {
// Identifier of the citing paper, if present in the response.
paper_id: Option<String>,
// Title of the citing paper, if present in the response.
title: Option<String>,
}
/// One element of a citations response: a wrapper around the citing paper.
///
/// With `rename_all = "camelCase"` the field is read from the JSON key `citingPaper`.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
struct Citation {
// The paper that does the citing.
citing_paper: CitingPaper
}
/// Generic struct to wrap the common API response pattern `{data: [...]}`.
///
/// `T` is the element type of the list. Only the `data` key is modelled here.
#[derive(Deserialize, Debug)]
struct ApiListResponse<T> {
// The list payload of the response.
data: Vec<T>
}
// TODO: Cache results in a (separate but local) database such as Redis
// TODO: Store results in a (separate but local) database such as Postgres
#[async_recursion]
async fn get_citations(paper_id: String, depth: u32, authors: &mut Vec<Author>) -> DataResult<Vec<Citation>> {
// Bound recursion to some depth
if depth > MAX_DEPTH {
return Ok(vec![]);
}
// Build the URL
let mut url = String::new();
write!(&mut url, "{}/paper/{}/citations", BASE_URL, paper_id)?;
let resp = reqwest::get(url)
.await?
.text()
.await?;
let resp_deserialized_attempt = serde_json::from_str::<ApiListResponse<Citation>>(resp.as_str());
if let Err(err) = resp_deserialized_attempt {
println!("depth {} paper {} error {}", depth, paper_id, err);
return Ok(vec![]);
}
let resp_deserialized: ApiListResponse<Citation> = resp_deserialized_attempt.unwrap();
for Citation{citing_paper: CitingPaper{paper_id: citing_paper_id, title}} in resp_deserialized.data {
if let (Some(citing_paper_id), Some(title)) = (citing_paper_id, title) {
let short_len = min(50, title.len());
let (short_title, _) = title.split_at(short_len);
println!("depth {} paper {} cites {} title {}", depth, citing_paper_id, paper_id, short_title);
get_citations(citing_paper_id, depth + 1, authors).await?;
}
}
Ok(vec![])
}
/// Requests the details of a single paper and returns the parsed `data` list.
///
/// # Parameters
/// * `paper_id` - the paper to look up.
/// * `depth` - only used to label the diagnostic line printed on a parse failure.
/// * `authors` - accepted for symmetry with `get_citations`; not used yet.
///
/// # Returns
/// The `data` list of the response when it deserializes as
/// `ApiListResponse<Paper>`, otherwise an empty vector.
///
/// # Errors
/// Request and body-read errors from `reqwest` are propagated. A body that does
/// not deserialize is reported on stdout and yields `Ok(vec![])`.
async fn get_paper_info(paper_id: String, depth: u32, authors: &mut Authors) -> DataResult<Vec<Paper>> {
    // Comma-separated list of requested fields. It must not contain spaces:
    // they would become part of the field names in the query string.
    // Probably also want: year,publicationDate,journal
    const FIELDS: &str = "title,authors,citations";

    // Build the URL
    let mut url = String::new();
    write!(&mut url, "{}/paper/{}?fields={}", BASE_URL, paper_id, FIELDS)?;

    let resp = reqwest::get(url).await?.text().await?;

    // NOTE(review): this expects the `{data: [...]}` wrapper; the single-paper
    // endpoint may return a bare object instead - confirm against the API docs.
    match serde_json::from_str::<ApiListResponse<Paper>>(&resp) {
        Ok(parsed) => Ok(parsed.data),
        Err(err) => {
            println!("depth {} paper {} error {}", depth, paper_id, err);
            Ok(vec![])
        }
    }
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
let Args{
depth,
paper_id,
// write_to_mongo,
} = Args::parse();
let mut authors = Authors::new();
get_citations(paper_id, depth, &mut authors).await?;
Ok(())
}