Start runs and coordinate with server

This commit is contained in:
Joscha 2023-08-12 02:16:02 +02:00
parent f79468c871
commit b23fc6460f
6 changed files with 296 additions and 32 deletions

View file

@ -1,5 +1,8 @@
//! Coordinate performing runs across servers.
use std::time::Duration;
use time::OffsetDateTime;
use tokio::sync::mpsc;
struct Server {
@ -9,32 +12,48 @@ struct Server {
pub struct Coordinator {
    servers: Vec<Server>,
    active: usize,
    active_since: OffsetDateTime,
    busy: bool,
}
impl Coordinator {
    pub fn new() -> Self {
        Self {
            servers: vec![],
            active: 0,
            active_since: OffsetDateTime::now_utc(),
            busy: false,
        }
    }
/// Add a server to the rotation this coordinator cycles through.
///
/// `poke` lets the coordinator wake that server's worker loop immediately
/// instead of it waiting for its next ping timeout.
pub fn register(&mut self, name: String, poke: mpsc::UnboundedSender<()>) {
// TODO Assert that no duplicate names exist?
self.servers.push(Server { name, poke });
}
pub fn active(&self, name: &str) -> bool {
if let Some(current) = self.servers.get(self.current) {
name == current.name
} else {
false
/// Report whether the server called `name` is the currently active one,
/// together with when the active slot last changed and whether it is busy.
pub fn active(&self, name: &str) -> ActiveInfo {
    let is_active = self
        .servers
        .get(self.active)
        .map_or(false, |server| server.name == name);
    ActiveInfo {
        active: is_active,
        active_since: self.active_since,
        busy: self.busy,
    }
}
pub fn look_busy(&mut self, name: &str) {
    // Check just to prevent weird shenanigans
    if !self.active(name).active {
        return;
    }
    self.busy = true;
}
pub fn move_to_next_server(&mut self, name: &str) {
// Check just to prevent weird shenanigans
if !self.active(name).active {
return;
}
@ -42,8 +61,10 @@ impl Coordinator {
// the previous check
assert!(!self.servers.is_empty());
self.current += 1;
self.current %= self.servers.len();
self.active += 1;
self.active %= self.servers.len();
self.active_since = OffsetDateTime::now_utc();
self.busy = false;
// When the worker seeks work and a queue is idle, the next server
// should be queried immediately. Otherwise, we'd introduce lots of
@ -61,8 +82,23 @@ impl Coordinator {
// will send two requests back-to-back: The first because their ping
// timeout ran out, and the second because they were poked. So far, I
// haven't been able to think of an elegant solution for this.
if self.current > 0 {
let _ = self.servers[self.current].poke.send(());
if self.active > 0 {
let _ = self.servers[self.active].poke.send(());
}
}
}
/// Snapshot of a server's standing in the coordinator's rotation,
/// as returned by `Coordinator::active`.
#[derive(Clone, Copy)]
pub struct ActiveInfo {
/// Whether the queried server is the currently active one.
pub active: bool,
/// When the currently active server became active.
pub active_since: OffsetDateTime,
/// Whether the coordinator is currently marked busy.
pub busy: bool,
}
impl ActiveInfo {
    /// Returns true while the batch window — starting at `active_since`
    /// and lasting `batch_duration` — has not yet elapsed.
    pub fn in_batch(&self, batch_duration: Duration) -> bool {
        OffsetDateTime::now_utc() <= self.active_since + batch_duration
    }
}

93
src/worker/run.rs Normal file
View file

@ -0,0 +1,93 @@
use std::{
collections::HashMap,
sync::{Arc, Mutex},
};
use time::OffsetDateTime;
use tokio::sync::mpsc;
use crate::{
id,
shared::{BenchMethod, FinishedRun, Measurement, Source, UnfinishedRun},
};
/// How many trailing output lines an unfinished run keeps when converted
/// into a reportable status (see `Run::into_full_status`).
const LIVE_SCROLLBACK: usize = 50;
/// A run's status together with the data needed to report it to a server.
pub enum FullRunStatus {
/// Still in progress; carries only the last `LIVE_SCROLLBACK` output lines.
Unfinished(UnfinishedRun),
/// Completed; carries full output and measurements.
Finished(FinishedRun),
/// The run was aborted; nothing to report.
Aborted,
}
/// Lifecycle state of a [`Run`], holding only the data specific to each state.
#[derive(Clone)]
pub enum RunStatus {
/// The run is still in progress.
Unfinished,
/// The run completed and produced results.
Finished {
/// When the run ended.
end: OffsetDateTime,
/// Exit code reported for the run.
exit_code: i32,
/// Measurements gathered during the run, keyed by name.
measurements: HashMap<String, Measurement>,
},
/// The run was aborted before finishing.
Aborted,
}
/// A single run, identified by a random id and created against a hash
/// (presumably a commit hash — confirm against callers of `Run::new`).
#[derive(Clone)]
pub struct Run {
/// Random id assigned at creation via `id::random_run_id`.
id: String,
/// The hash this run was started for.
hash: String,
/// UTC time the run was created.
start: OffsetDateTime,
/// Captured output lines, each tagged with their `Source`.
output: Vec<(Source, String)>,
/// Current lifecycle state of the run.
status: RunStatus,
}
impl Run {
    /// Create a fresh, unfinished run for `hash`, stamped with a random id
    /// and the current UTC time, with no output captured yet.
    pub fn new(hash: String) -> Self {
        Self {
            id: id::random_run_id(),
            hash,
            start: OffsetDateTime::now_utc(),
            output: vec![],
            status: RunStatus::Unfinished,
        }
    }

    /// Convert this run into a reportable status snapshot.
    ///
    /// Unfinished runs carry only their last `LIVE_SCROLLBACK` output lines;
    /// finished runs carry the complete output plus their measurements.
    pub fn into_full_status(self) -> FullRunStatus {
        match self.status {
            RunStatus::Unfinished => {
                // Keep only the tail of the output for live display.
                let skip = self.output.len().saturating_sub(LIVE_SCROLLBACK);
                FullRunStatus::Unfinished(UnfinishedRun {
                    id: self.id,
                    hash: self.hash,
                    start: self.start,
                    last_output: self.output.into_iter().skip(skip).collect(),
                })
            }
            RunStatus::Finished {
                end,
                exit_code,
                measurements,
            } => FullRunStatus::Finished(FinishedRun {
                id: self.id,
                hash: self.hash,
                start: self.start,
                end,
                exit_code,
                measurements,
                output: self.output,
            }),
            RunStatus::Aborted => FullRunStatus::Aborted,
        }
    }
}
/// Execute a benchmark run (not yet implemented).
///
/// NOTE(review): intended contract based on the signature — `run` is the
/// shared run state to update with output/status, `abort_rx` signals that
/// the run should be cancelled, and `bench_method` selects how to bench.
/// Confirm once the body is implemented.
pub async fn run(
run: Arc<Mutex<Run>>,
abort_rx: mpsc::UnboundedReceiver<()>,
bench_method: BenchMethod,
) {
// TODO Implement
}

View file

@ -7,11 +7,15 @@ use tracing::{debug, info_span, warn, Instrument};
use crate::{
config::{Config, WorkerServerConfig},
id,
shared::{WorkerRequest, WorkerStatus},
shared::{FinishedRun, ServerResponse, WorkerRequest, WorkerStatus},
somehow,
worker::run::{self, FullRunStatus},
};
use super::coordinator::Coordinator;
use super::{
coordinator::{ActiveInfo, Coordinator},
run::Run,
};
pub struct Server {
name: String,
@ -20,6 +24,9 @@ pub struct Server {
coordinator: Arc<Mutex<Coordinator>>,
client: Client,
secret: String,
// TODO Cache bench dir
run: Option<(Arc<Mutex<Run>>, mpsc::UnboundedSender<()>)>,
}
impl Server {
@ -36,16 +43,19 @@ impl Server {
coordinator,
client: Client::new(),
secret: id::random_worker_secret(),
run: None,
}
}
pub async fn run(&mut self) {
// Register with coordinator
let (poke_tx, mut poke_rx) = mpsc::unbounded_channel();
self.coordinator
.lock()
.unwrap()
.register(self.name.clone(), poke_tx.clone());
// Main loop
let name = self.name.clone();
async {
loop {
@ -54,37 +64,147 @@ impl Server {
Err(e) => warn!("Error talking to server:\n{e:?}"),
}
// Wait for poke or until the ping delay elapses. If we get
// poked while pinging the server, this will not wait and we'll
// immediately do another ping.
let _ = tokio::time::timeout(self.config.worker_ping_delay, poke_rx.recv()).await;
// Empty queue in case we were poked more than once. This can
// happen for example if we get poked multiple times while
// pinging the server.
while poke_rx.try_recv().is_ok() {}
self.wait_until_next_ping(&mut poke_rx).await;
}
}
.instrument(info_span!("worker", name))
.await;
}
async fn ping(&self) -> somehow::Result<()> {
debug!("Pinging");
/// Sleep until either the ping delay elapses or a poke arrives, then drain
/// any pokes that have piled up.
async fn wait_until_next_ping(&self, poke_rx: &mut mpsc::UnboundedReceiver<()>) {
    // A poke cuts the wait short so the next ping happens immediately;
    // otherwise we simply wait out the configured delay. If we were poked
    // while pinging the server, this resolves at once.
    let wait = tokio::time::timeout(self.config.worker_ping_delay, poke_rx.recv());
    let _ = wait.await;
    // Discard any further queued pokes (e.g. several arriving while a ping
    // was in flight) so a single wake-up handles them all.
    while let Ok(()) = poke_rx.try_recv() {}
}
/// Send one status ping to the server, dispatching on whether this worker
/// is currently the active one according to the coordinator.
async fn ping(&mut self) -> somehow::Result<()> {
    debug!("Pinging server");
    let info = self.coordinator.lock().unwrap().active(&self.name);
    match info.active {
        true => self.ping_active(info).await,
        false => self.ping_inactive(info).await,
    }
}
/// Ping while some other worker is active: only report busy/idle, never
/// request or submit work.
async fn ping_inactive(&self, info: ActiveInfo) -> somehow::Result<()> {
    // An inactive worker must never be holding a run.
    assert!(self.run.is_none());
    let status = if info.busy {
        WorkerStatus::Busy
    } else {
        WorkerStatus::Idle
    };
    self.request(status, false, None).await?;
    Ok(())
}
/// Ping the server while this worker is the active one.
///
/// Reports the current run's status, submits finished results, requests new
/// work while still inside the batch window, and advances the coordinator to
/// the next server once there is nothing left to do here.
async fn ping_active(&mut self, info: ActiveInfo) -> somehow::Result<()> {
// Snapshot the current run's status; having no run at all is treated the
// same as an aborted run.
let run = self
.run
.as_ref()
.map(|(r, _)| r.lock().unwrap().clone().into_full_status())
.unwrap_or(FullRunStatus::Aborted);
let unfinished = matches!(run, FullRunStatus::Unfinished(_));
let aborted = matches!(run, FullRunStatus::Aborted);
let in_batch = info.in_batch(self.config.worker_batch_duration);
// Only finished runs are submitted; an unfinished run is reported as a
// Working status instead.
let (status, submit_work) = match run {
FullRunStatus::Unfinished(run) => (WorkerStatus::Working(run), None),
FullRunStatus::Finished(run) => (WorkerStatus::Idle, Some(run)),
FullRunStatus::Aborted => (WorkerStatus::Idle, None),
};
// Only ask for new work while inside the batch window and while no run
// is currently in progress.
let request_work = in_batch && !unfinished;
let response = self.request(status, request_work, submit_work).await;
if response.is_err() && aborted {
// We have nothing important going on, let's defer to the next
// server and hope this one will respond again soon.
self.coordinator
.lock()
.unwrap()
.move_to_next_server(&self.name);
// Return explicitly to ensure we don't continue to the rest of the
// function in the false belief that we're active. Oh, and don't
// swallow the error.
// NOTE(review): `response` is an Err in this branch, so `response?`
// always propagates the error and the `return Ok(())` below is
// unreachable; it exists purely as a safeguard.
response?;
return Ok(());
}
let response = response?;
// Clean up self.run if we no longer need it
if !unfinished {
// We can get rid of finished runs since we just successfully sent
// the server the results.
self.run = None;
}
// Abort run if server says so
if response.abort_work {
if let Some((_, abort_tx)) = &self.run {
let _ = abort_tx.send(());
}
}
// Start work (but only if we requested it)
if let Some(work) = response.work.filter(|_| request_work) {
assert!(!unfinished);
assert!(self.run.is_none());
let run = Arc::new(Mutex::new(Run::new(work.hash)));
let (abort_tx, abort_rx) = mpsc::unbounded_channel();
self.run = Some((run.clone(), abort_tx));
self.coordinator.lock().unwrap().look_busy(&self.name);
tokio::spawn(run::run(run, abort_rx, work.bench));
}
// Finally, advance to the next server if it makes sense to do so
if self.run.is_none() {
self.coordinator
.lock()
.unwrap()
.move_to_next_server(&self.name);
}
Ok(())
}
async fn request(
    &self,
    status: WorkerStatus,
    request_work: bool,
    submit_work: Option<FinishedRun>,
) -> somehow::Result<ServerResponse> {
    let url = format!("{}api/worker/status", self.server_config.url);
    let request = WorkerRequest {
        info: None,
        secret: self.secret.clone(),
        status,
        request_work,
        submit_work,
    };
    let response = self
        .client
        .post(url)
        .basic_auth(&self.config.worker_name, Some(&self.server_config.token))
        .json(&request)
        .send()
        .await?
        .json::<ServerResponse>()
        .await?;
    Ok(response)
}
}
}