Restructure shared types and db columns

Now, the server sends the runner pretty much all run metadata. This way,
the reservation the server makes for the runner is accurate, provided
the runner responds with the same metadata it was sent. It also means
that only the server's system clock is relevant for tie breakers, and a
run's duration spans from the moment it is reserved to the moment it is
saved.

Also, the bench method is now always called `bench_method` and a
human-readable description is stored in the database for each run.
This commit is contained in:
Joscha 2023-08-12 19:15:38 +02:00
parent 53be0338f2
commit c7a89867a7
8 changed files with 121 additions and 98 deletions

View file

@ -1,12 +0,0 @@
{
"db_name": "SQLite",
"query": "INSERT INTO runs ( id, hash, start, end, exit_code ) VALUES (?, ?, ?, ?, ?) ",
"describe": {
"columns": [],
"parameters": {
"Right": 5
},
"nullable": []
},
"hash": "40f965913a8a3ec16da66dd79c12710279ba817bc5f50661f592371297efd651"
}

View file

@ -0,0 +1,12 @@
{
"db_name": "SQLite",
"query": "INSERT INTO runs ( id, hash, bench_method, start, end, exit_code ) VALUES (?, ?, ?, ?, ?, ?) ",
"describe": {
"columns": [],
"parameters": {
"Right": 6
},
"nullable": []
},
"hash": "d1e7da8b6a2018e621e3fd6d7a74668a82fabb1d83bdfc8bf763bff733b3388c"
}

View file

@ -1,9 +1,10 @@
CREATE TABLE runs ( CREATE TABLE runs (
id TEXT NOT NULL PRIMARY KEY, id TEXT NOT NULL PRIMARY KEY,
hash TEXT NOT NULL, hash TEXT NOT NULL,
start TEXT NOT NULL, bench_method TEXT NOT NULL,
end TEXT NOT NULL, start TEXT NOT NULL,
exit_code INT NOT NULL, end TEXT NOT NULL,
exit_code INT NOT NULL,
FOREIGN KEY (hash) REFERENCES commits (hash) ON DELETE CASCADE FOREIGN KEY (hash) REFERENCES commits (hash) ON DELETE CASCADE
) STRICT; ) STRICT;

View file

@ -28,31 +28,39 @@ use crate::{
somehow, somehow,
}; };
async fn save_work(run: FinishedRun, db: &SqlitePool) -> somehow::Result<()> { async fn save_work(finished: FinishedRun, db: &SqlitePool) -> somehow::Result<()> {
let mut tx = db.begin().await?; let mut tx = db.begin().await?;
let conn = tx.acquire().await?; let conn = tx.acquire().await?;
let end = OffsetDateTime::now_utc();
let bench_method = match finished.run.bench_method {
BenchMethod::Internal => "internal".to_string(),
BenchMethod::Repo { hash } => format!("bench repo, hash {hash}"),
};
sqlx::query!( sqlx::query!(
"\ "\
INSERT INTO runs ( \ INSERT INTO runs ( \
id, \ id, \
hash, \ hash, \
bench_method, \
start, \ start, \
end, \ end, \
exit_code \ exit_code \
) \ ) \
VALUES (?, ?, ?, ?, ?) \ VALUES (?, ?, ?, ?, ?, ?) \
", ",
run.id, finished.run.id,
run.hash, finished.run.hash,
run.start, bench_method,
run.end, finished.run.start,
run.exit_code, end,
finished.exit_code,
) )
.execute(&mut *conn) .execute(&mut *conn)
.await?; .await?;
for (name, measurement) in run.measurements { for (name, measurement) in finished.measurements {
sqlx::query!( sqlx::query!(
"\ "\
INSERT INTO run_measurements ( \ INSERT INTO run_measurements ( \
@ -65,7 +73,7 @@ async fn save_work(run: FinishedRun, db: &SqlitePool) -> somehow::Result<()> {
) \ ) \
VALUES (?, ?, ?, ?, ?, ?) \ VALUES (?, ?, ?, ?, ?, ?) \
", ",
run.id, finished.run.id,
name, name,
measurement.value, measurement.value,
measurement.stddev, measurement.stddev,
@ -76,7 +84,7 @@ async fn save_work(run: FinishedRun, db: &SqlitePool) -> somehow::Result<()> {
.await?; .await?;
} }
for (idx, (source, text)) in run.output.into_iter().enumerate() { for (idx, (source, text)) in finished.output.into_iter().enumerate() {
// Hopefully we won't need more than 4294967296 output chunks per run :P // Hopefully we won't need more than 4294967296 output chunks per run :P
let idx = idx as u32; let idx = idx as u32;
sqlx::query!( sqlx::query!(
@ -89,7 +97,7 @@ async fn save_work(run: FinishedRun, db: &SqlitePool) -> somehow::Result<()> {
) \ ) \
VALUES (?, ?, ?, ?) \ VALUES (?, ?, ?, ?) \
", ",
run.id, finished.run.id,
idx, idx,
source, source,
text, text,
@ -99,7 +107,7 @@ async fn save_work(run: FinishedRun, db: &SqlitePool) -> somehow::Result<()> {
} }
// The thing has been done :D // The thing has been done :D
sqlx::query!("DELETE FROM queue WHERE hash = ?", run.hash) sqlx::query!("DELETE FROM queue WHERE hash = ?", finished.run.hash)
.execute(&mut *conn) .execute(&mut *conn)
.await?; .await?;
@ -120,7 +128,7 @@ async fn post_status(
Err(response) => return Ok(response), Err(response) => return Ok(response),
}; };
if let Some(run) = request.submit_work { if let Some(run) = request.submit_run {
save_work(run, &db).await?; save_work(run, &db).await?;
} }
@ -153,8 +161,8 @@ async fn post_status(
name.clone(), name.clone(),
WorkerInfo::new(request.secret, OffsetDateTime::now_utc(), request.status), WorkerInfo::new(request.secret, OffsetDateTime::now_utc(), request.status),
); );
let work = match request.request_work { let work = match request.request_run {
true => guard.find_work(&name, &queue, bench_method), true => guard.find_and_reserve_run(&name, &queue, bench_method),
false => None, false => None,
}; };
let abort_work = guard.should_abort_work(&name, &queue); let abort_work = guard.should_abort_work(&name, &queue);
@ -162,7 +170,11 @@ async fn post_status(
}; };
debug!("Received status update from {name}"); debug!("Received status update from {name}");
Ok(Json(ServerResponse { work, abort_work }).into_response()) Ok(Json(ServerResponse {
run: work,
abort_run: abort_work,
})
.into_response())
} }
fn stream_response(repo: Arc<ThreadSafeRepository>, id: ObjectId) -> impl IntoResponse { fn stream_response(repo: Arc<ThreadSafeRepository>, id: ObjectId) -> impl IntoResponse {

View file

@ -64,12 +64,19 @@ async fn get_workers(
let status = match &info.status { let status = match &info.status {
WorkerStatus::Idle => Status::Idle, WorkerStatus::Idle => Status::Idle,
WorkerStatus::Busy => Status::Busy, WorkerStatus::Busy => Status::Busy,
WorkerStatus::Working(run) => { WorkerStatus::Working(unfinished) => {
let message = let message = sqlx::query_scalar!(
sqlx::query_scalar!("SELECT message FROM commits WHERE hash = ?", run.hash) "SELECT message FROM commits WHERE hash = ?",
.fetch_one(db) unfinished.run.hash
.await?; )
Status::Working(RunLink::new(base, run.id.clone(), &run.hash, &message)) .fetch_one(db)
.await?;
Status::Working(RunLink::new(
base,
unfinished.run.id.clone(),
&unfinished.run.hash,
&message,
))
} }
}; };
@ -89,9 +96,9 @@ async fn get_queue(
// Group workers by commit hash // Group workers by commit hash
let mut workers_by_commit: HashMap<String, Vec<WorkerLink>> = HashMap::new(); let mut workers_by_commit: HashMap<String, Vec<WorkerLink>> = HashMap::new();
for (name, info) in workers { for (name, info) in workers {
if let WorkerStatus::Working(run) = &info.status { if let WorkerStatus::Working(unfinished) = &info.status {
workers_by_commit workers_by_commit
.entry(run.hash.clone()) .entry(unfinished.run.hash.clone())
.or_default() .or_default()
.push(WorkerLink::new(base, name.clone())); .push(WorkerLink::new(base, name.clone()));
} }

View file

@ -5,7 +5,7 @@ use time::OffsetDateTime;
use crate::{ use crate::{
config::Config, config::Config,
id, id,
shared::{BenchMethod, UnfinishedRun, Work, WorkerStatus}, shared::{BenchMethod, Run, UnfinishedRun, WorkerStatus},
}; };
#[derive(Clone)] #[derive(Clone)]
@ -54,42 +54,49 @@ impl Workers {
self.workers.insert(name, info); self.workers.insert(name, info);
} }
/// Find and reserve work for a worker. pub fn find_and_reserve_run(
pub fn find_work(&mut self, name: &str, queue: &[String], bench: BenchMethod) -> Option<Work> { &mut self,
name: &str,
queue: &[String],
bench_method: BenchMethod,
) -> Option<Run> {
let covered = self let covered = self
.workers .workers
.values() .values()
.filter_map(|info| match &info.status { .filter_map(|info| match &info.status {
WorkerStatus::Idle | WorkerStatus::Busy => None, WorkerStatus::Idle | WorkerStatus::Busy => None,
WorkerStatus::Working(run) => Some(&run.hash), WorkerStatus::Working(unfinished) => Some(&unfinished.run.hash),
}) })
.collect::<HashSet<_>>(); .collect::<HashSet<_>>();
// Find work not already covered by another worker // Find work not already covered by another worker
let hash = queue.iter().find(|hash| !covered.contains(hash))?.clone(); let hash = queue.iter().find(|hash| !covered.contains(hash))?.clone();
let id = id::random_run_id(); let id = id::random_run_id();
let work = Work { id, hash, bench }; let run = Run {
id,
hash,
bench_method,
start: OffsetDateTime::now_utc(),
};
// Reserve work so other workers don't choose it // Reserve work so other workers don't choose it
if let Some(info) = self.workers.get_mut(name) { if let Some(info) = self.workers.get_mut(name) {
info.status = WorkerStatus::Working(UnfinishedRun { info.status = WorkerStatus::Working(UnfinishedRun {
id: work.id.clone(), run: run.clone(),
hash: work.hash.clone(),
start: OffsetDateTime::now_utc(),
last_output: vec![], last_output: vec![],
}); });
} }
Some(work) Some(run)
} }
pub fn should_abort_work(&self, name: &str, queue: &[String]) -> bool { pub fn should_abort_work(&self, name: &str, queue: &[String]) -> bool {
// A runner should abort work if... // A runner should abort work if...
let Some(info) = self.workers.get(name) else { return false; }; let Some(info) = self.workers.get(name) else { return false; };
let WorkerStatus::Working (run) = &info.status else { return false; }; let WorkerStatus::Working (unfinished) = &info.status else { return false; };
// The commit isn't in the queue // The commit isn't in the queue
if !queue.contains(&run.hash) { if !queue.contains(&unfinished.run.hash) {
return true; return true;
} }
@ -98,7 +105,9 @@ impl Workers {
.workers .workers
.iter() .iter()
.filter_map(|(name, info)| match &info.status { .filter_map(|(name, info)| match &info.status {
WorkerStatus::Working(r) if r.hash == run.hash => Some((name, r.start)), WorkerStatus::Working(u) if u.run.hash == unfinished.run.hash => {
Some((name, u.run.start))
}
_ => None, _ => None,
}) })
.max_by_key(|(_, start)| *start) .max_by_key(|(_, start)| *start)

View file

@ -10,6 +10,14 @@ fn is_false(b: &bool) -> bool {
!b !b
} }
#[derive(Clone, Serialize_repr, Deserialize_repr, sqlx::Type)]
#[repr(u8)]
pub enum Source {
// Stdin would be fd 0
Stdout = 1,
Stderr = 2,
}
#[derive(Clone, Serialize_repr, Deserialize_repr, sqlx::Type)] #[derive(Clone, Serialize_repr, Deserialize_repr, sqlx::Type)]
#[repr(i8)] #[repr(i8)]
pub enum Direction { pub enum Direction {
@ -29,34 +37,40 @@ pub struct Measurement {
pub direction: Option<Direction>, pub direction: Option<Direction>,
} }
#[derive(Clone, Serialize_repr, Deserialize_repr, sqlx::Type)] #[derive(Clone, Serialize, Deserialize)]
#[repr(u8)] #[serde(rename_all = "snake_case")]
pub enum Source { #[serde(tag = "type")]
// Stdin would be fd 0 pub enum BenchMethod {
Stdout = 1, Internal,
Stderr = 2, Repo { hash: String },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct Run {
pub id: String,
pub hash: String,
pub bench_method: BenchMethod,
pub start: OffsetDateTime,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
pub struct UnfinishedRun { pub struct UnfinishedRun {
pub id: String, #[serde(flatten)]
pub hash: String, pub run: Run,
pub start: OffsetDateTime,
#[serde(default)] #[serde(default)]
pub last_output: Vec<(Source, String)>, pub last_output: Vec<(Source, String)>,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
pub struct FinishedRun { pub struct FinishedRun {
pub id: String, #[serde(flatten)]
pub hash: String, pub run: Run,
pub start: OffsetDateTime,
pub end: OffsetDateTime,
#[serde(default)] #[serde(default)]
pub exit_code: i32, pub exit_code: i32,
pub measurements: HashMap<String, Measurement>,
#[serde(default)] #[serde(default)]
pub output: Vec<(Source, String)>, pub output: Vec<(Source, String)>,
#[serde(default)]
pub measurements: HashMap<String, Measurement>,
} }
#[derive(Clone, Serialize, Deserialize)] #[derive(Clone, Serialize, Deserialize)]
@ -84,48 +98,28 @@ pub struct WorkerRequest {
/// What the worker is currently working on. /// What the worker is currently working on.
pub status: WorkerStatus, pub status: WorkerStatus,
/// Whether the worker wants new work from the server. /// The worker wants a new run from the server.
/// ///
/// If the server has a commit available, it should respond with a non-null /// If the server has a commit available, it should respond with a non-null
/// [`Response::work`]. /// [`ServerResponse::run`].
#[serde(default, skip_serializing_if = "is_false")] #[serde(default, skip_serializing_if = "is_false")]
pub request_work: bool, pub request_run: bool,
/// The worker has finished a run and wants to submit the results. /// The worker has finished a run and wants to submit the results.
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub submit_work: Option<FinishedRun>, pub submit_run: Option<FinishedRun>,
}
#[derive(Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
#[serde(tag = "type")]
pub enum BenchMethod {
/// Use internal (deterministic) benchmarking code.
Internal,
/// Use a commit from a bench repo.
Repo { hash: String },
}
#[derive(Clone, Serialize, Deserialize)]
pub struct Work {
/// Id of the run.
pub id: String,
/// Hash of commit to benchmark.
pub hash: String,
/// How to benchmark the commit.
pub bench: BenchMethod,
} }
#[derive(Serialize, Deserialize)] #[derive(Serialize, Deserialize)]
pub struct ServerResponse { pub struct ServerResponse {
/// Work the worker requested using [`Request::request_work]. /// Run the worker requested using [`RunnerRequest::request_run`].
/// ///
/// The worker may ignore this work and do something else. However, until /// The worker may ignore this run and do something else. However, until the
/// the next update request sent by the worker, the server will consider the /// next update request sent by the worker, the server will consider the
/// worker as preparing to work on the commit, and will not give out the /// worker as preparing to work on the commit, and will not give out the
/// same commit to other workers. /// same commit to other workers.
#[serde(skip_serializing_if = "Option::is_none")] #[serde(skip_serializing_if = "Option::is_none")]
pub work: Option<Work>, pub run: Option<Run>,
/// The worker should abort the current run. /// The worker should abort the current run.
/// ///
@ -133,5 +127,5 @@ pub struct ServerResponse {
/// the same commit as another worker and has broken the tie in favor of the /// the same commit as another worker and has broken the tie in favor of the
/// other worker. The worker may continue the run despite this flag. /// other worker. The worker may continue the run despite this flag.
#[serde(default, skip_serializing_if = "is_false")] #[serde(default, skip_serializing_if = "is_false")]
pub abort_work: bool, pub abort_run: bool,
} }

View file

@ -47,8 +47,8 @@ impl Server {
info: None, info: None,
secret: self.secret.clone(), secret: self.secret.clone(),
status, status,
request_work, request_run: request_work,
submit_work, submit_run: submit_work,
}; };
let response = self let response = self