forked from 1jehuang/jcode
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.rs
More file actions
1807 lines (1678 loc) · 78.5 KB
/
server.rs
File metadata and controls
1807 lines (1678 loc) · 78.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
mod await_members_state;
mod background_tasks;
mod client_actions;
mod client_api;
mod client_comm;
mod client_comm_channels;
mod client_comm_context;
mod client_comm_message;
mod client_disconnect_cleanup;
mod client_lifecycle;
mod client_session;
mod client_state;
mod comm_await;
mod comm_control;
mod comm_plan;
mod comm_session;
mod comm_sync;
mod debug;
mod debug_ambient;
mod debug_command_exec;
mod debug_events;
mod debug_help;
mod debug_jobs;
mod debug_server_state;
mod debug_session_admin;
mod debug_swarm_read;
mod debug_swarm_write;
mod debug_testers;
mod durable_state;
mod headless;
mod lifecycle;
mod provider_control;
mod reload;
mod reload_recovery;
mod reload_state;
mod runtime;
mod socket;
mod swarm;
mod swarm_channels;
mod swarm_mutation_state;
mod swarm_persistence;
mod util;
pub(super) use self::await_members_state::AwaitMembersRuntime;
use self::background_tasks::{
dispatch_background_task_completion, dispatch_background_task_progress, dispatch_ui_activity,
};
use self::debug::{ClientConnectionInfo, ClientDebugState};
use self::debug_jobs::DebugJob;
use self::headless::create_headless_session;
use self::reload::await_reload_signal;
use self::runtime::ServerRuntime;
use self::swarm::{
broadcast_swarm_plan, broadcast_swarm_plan_with_previous, broadcast_swarm_status,
record_swarm_event, record_swarm_event_for_session, refresh_swarm_task_staleness,
remove_plan_participant, remove_session_file_touches, remove_session_from_swarm,
rename_plan_participant, run_swarm_message, update_member_status,
update_member_status_with_report,
};
use self::swarm_channels::{
remove_session_channel_subscriptions, subscribe_session_to_channel,
unsubscribe_session_from_channel,
};
pub(super) use self::swarm_mutation_state::SwarmMutationRuntime;
use self::swarm_persistence::{
LoadedSwarmRuntimeState, load_runtime_state as load_persisted_swarm_runtime_state,
persist_swarm_state as persist_swarm_state_snapshot,
remove_swarm_state as remove_persisted_swarm_state,
};
use self::util::get_shared_mcp_pool;
use crate::agent::Agent;
use crate::ambient_runner::AmbientRunnerHandle;
use crate::bus::{Bus, BusEvent};
use crate::protocol::{NotificationType, ServerEvent};
use crate::provider::Provider;
use crate::runtime_memory_log::{
RuntimeMemoryLogController, RuntimeMemoryLogSampling, RuntimeMemoryLogTrigger,
ServerRuntimeMemoryBackground, ServerRuntimeMemoryClients, ServerRuntimeMemoryEmbeddings,
ServerRuntimeMemorySample, ServerRuntimeMemoryServer, ServerRuntimeMemorySessions,
ServerRuntimeMemoryTopSession,
};
use crate::tool::selfdev::ReloadContext;
use crate::transport::Listener;
use anyhow::Result;
use jcode_agent_runtime::{InterruptSignal, SoftInterruptSource};
use jcode_swarm_core::{
append_swarm_completion_report_instructions, format_structured_completion_report,
summarize_plan_items, truncate_detail,
};
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::sync::{Mutex, OnceCell, RwLock, broadcast, mpsc};
/// Shared, lock-protected map of live agents keyed by a string id.
/// `Server::sessions` has this same shape and documents the key as the session id.
pub(super) type SessionAgents = Arc<RwLock<HashMap<String, Arc<Mutex<Agent>>>>>;
/// Shared two-level subscription index: outer key -> inner key -> set of strings.
/// `Server` uses it in both directions: swarm_id -> channel -> session_ids, and the
/// reverse index session_id -> swarm_id -> channels.
pub(super) type ChannelSubscriptions =
Arc<RwLock<HashMap<String, HashMap<String, HashSet<String>>>>>;
/// Load the current runtime view of `swarm_id` from `swarm_state` and pass its
/// plan, coordinator session id, and members to the swarm persistence layer.
pub(super) async fn persist_swarm_state_for(swarm_id: &str, swarm_state: &SwarmState) {
    let snapshot = swarm_state.load_runtime(swarm_id).await;
    let plan = snapshot.plan.as_ref();
    let coordinator = snapshot.coordinator_session_id.as_deref();
    persist_swarm_state_snapshot(swarm_id, plan, coordinator, &snapshot.members);
}
/// Delete the persisted state for `swarm_id`, but only when the swarm's loaded
/// runtime reports that it no longer holds any state.
pub(super) async fn remove_persisted_swarm_state_for(swarm_id: &str, swarm_state: &SwarmState) {
    let runtime = swarm_state.load_runtime(swarm_id).await;
    // A swarm that still has runtime state keeps its persisted copy.
    if !runtime.has_any_state() {
        remove_persisted_swarm_state(swarm_id);
    }
}
/// Decide whether a swarm member should be restored on startup.
///
/// Returns `true` only for headless members whose `status` is not one of the
/// terminal values `"completed"`, `"done"`, `"failed"`, or `"stopped"`.
/// The comparison is exact (case-sensitive).
fn headless_member_should_restore(status: &str, is_headless: bool) -> bool {
    const TERMINAL_STATUSES: [&str; 4] = ["completed", "done", "failed", "stopped"];
    if !is_headless {
        return false;
    }
    !TERMINAL_STATUSES.contains(&status)
}
/// Ask `ReloadContext::recovery_directive` for a directive built from `reload_ctx`
/// and return its continuation message, or `None` when no directive is produced.
fn headless_reload_continuation_message(reload_ctx: Option<ReloadContext>) -> Option<String> {
    let directive = ReloadContext::recovery_directive(reload_ctx.as_ref(), true, "", None)?;
    Some(directive.continuation_message)
}
/// Counters gathered during headless startup recovery and reported in its
/// closing timing log line.
#[derive(Default)]
struct HeadlessRecoveryStats {
/// Sessions considered for recovery (incremented once per restore candidate).
candidates: usize,
/// Sessions for which a continuation task was spawned.
resumed: usize,
/// Sessions that were restored but judged not to need resuming.
skipped: usize,
/// Sessions whose stored state could not be loaded.
failed_to_load: usize,
}
/// Build the parts of a runtime memory sample that every sample kind shares.
///
/// Captures a timestamp, a process memory snapshot tagged with `source`, the
/// connected client count, the background task count, and embedding statistics.
/// `sessions` is left as `None`; callers that want per-session data fill it in.
async fn capture_runtime_memory_common_sample(
    identity: &ServerIdentity,
    client_count: &Arc<RwLock<usize>>,
    server_start_time: Instant,
    kind: &str,
    source: &str,
    trigger: RuntimeMemoryLogTrigger,
    sampling: RuntimeMemoryLogSampling,
) -> ServerRuntimeMemorySample {
    let captured_at = chrono::Utc::now();
    let process =
        crate::process_memory::snapshot_with_source(format!("server:runtime-log:{source}"));

    let clients = ServerRuntimeMemoryClients {
        connected_count: *client_count.read().await,
    };
    let background = ServerRuntimeMemoryBackground {
        task_count: crate::background::global().list().await.len(),
    };
    // Fields are evaluated in written order: stats first, then availability.
    let embeddings = ServerRuntimeMemoryEmbeddings {
        stats: crate::embedding::stats(),
        model_available: crate::embedding::is_model_available(),
    };

    ServerRuntimeMemorySample {
        schema_version: 2,
        kind: kind.to_string(),
        timestamp: captured_at.to_rfc3339(),
        timestamp_ms: captured_at.timestamp_millis(),
        source: source.to_string(),
        trigger,
        sampling,
        server: ServerRuntimeMemoryServer {
            id: identity.id.clone(),
            name: identity.name.clone(),
            icon: identity.icon.clone(),
            version: identity.version.clone(),
            git_hash: identity.git_hash.clone(),
            uptime_secs: server_start_time.elapsed().as_secs(),
        },
        // Diagnostics borrow the snapshot, so they must be built before it is moved.
        process_diagnostics: crate::runtime_memory_log::build_process_diagnostics(&process),
        process,
        clients,
        sessions: None,
        background,
        embeddings,
    }
}
/// Capture a runtime memory sample of kind `"process"`.
///
/// Delegates to `capture_runtime_memory_common_sample` and returns its result
/// unchanged, so the sample carries no per-session data.
async fn capture_runtime_memory_process_sample(
    identity: &ServerIdentity,
    client_count: &Arc<RwLock<usize>>,
    server_start_time: Instant,
    source: &str,
    trigger: RuntimeMemoryLogTrigger,
    sampling: RuntimeMemoryLogSampling,
) -> ServerRuntimeMemorySample {
    const SAMPLE_KIND: &str = "process";
    let sample = capture_runtime_memory_common_sample(
        identity,
        client_count,
        server_start_time,
        SAMPLE_KIND,
        source,
        trigger,
        sampling,
    );
    sample.await
}
/// Capture a runtime memory sample of kind `"attribution"`, including per-session totals.
///
/// Starts from the common sample, then profiles every session whose agent mutex
/// can be taken with `try_lock`. Sessions whose lock is busy are counted as
/// contended and skipped rather than waited on. The resulting `sessions` section
/// holds the aggregate totals plus the five sessions with the largest `json_bytes`.
async fn capture_runtime_memory_attribution_sample(
    identity: &ServerIdentity,
    sessions: &SessionAgents,
    client_count: &Arc<RwLock<usize>>,
    server_start_time: Instant,
    source: &str,
    trigger: RuntimeMemoryLogTrigger,
    sampling: RuntimeMemoryLogSampling,
) -> ServerRuntimeMemorySample {
    /// Maximum number of sessions reported in `top_by_json_bytes`.
    const TOP_SESSION_LIMIT: usize = 5;

    let mut sample = capture_runtime_memory_common_sample(
        identity,
        client_count,
        server_start_time,
        "attribution",
        source,
        trigger,
        sampling,
    )
    .await;

    // Collect one entry per session that could be locked; the sessions read
    // guard is released as soon as this block ends.
    let (live_count, contended_count, mut profiled) = {
        let agents = sessions.read().await;
        let mut contended = 0usize;
        let mut entries: Vec<ServerRuntimeMemoryTopSession> = Vec::new();
        for (session_id, agent_arc) in agents.iter() {
            let Ok(mut agent) = agent_arc.try_lock() else {
                contended += 1;
                continue;
            };
            let profile = agent.session_memory_profile_snapshot();
            let memory_enabled = agent.memory_enabled();
            entries.push(ServerRuntimeMemoryTopSession {
                session_id: session_id.clone(),
                provider: agent.provider_name(),
                model: agent.provider_model(),
                memory_enabled,
                message_count: profile.message_count as u64,
                provider_cache_message_count: profile.provider_cache_message_count as u64,
                json_bytes: profile.total_json_bytes as u64,
                payload_text_bytes: profile.payload_text_bytes as u64,
                provider_cache_json_bytes: profile.provider_cache_json_bytes as u64,
                tool_result_bytes: profile.canonical_tool_result_bytes as u64,
                provider_cache_tool_result_bytes: profile.provider_cache_tool_result_bytes
                    as u64,
                large_blob_bytes: profile.canonical_large_blob_bytes as u64,
                provider_cache_large_blob_bytes: profile.provider_cache_large_blob_bytes as u64,
            });
        }
        (agents.len(), contended, entries)
    };

    // Totals cover every sampled session, so fold them before truncating.
    let mut summary = ServerRuntimeMemorySessions {
        live_count,
        sampled_count: profiled.len(),
        contended_count,
        memory_enabled_session_count: profiled
            .iter()
            .filter(|entry| entry.memory_enabled)
            .count(),
        total_message_count: 0,
        total_provider_cache_message_count: 0,
        total_json_bytes: 0,
        total_payload_text_bytes: 0,
        total_provider_cache_json_bytes: 0,
        total_tool_result_bytes: 0,
        total_provider_cache_tool_result_bytes: 0,
        total_large_blob_bytes: 0,
        total_provider_cache_large_blob_bytes: 0,
        top_by_json_bytes: Vec::new(),
    };
    for entry in &profiled {
        summary.total_message_count += entry.message_count;
        summary.total_provider_cache_message_count += entry.provider_cache_message_count;
        summary.total_json_bytes += entry.json_bytes;
        summary.total_payload_text_bytes += entry.payload_text_bytes;
        summary.total_provider_cache_json_bytes += entry.provider_cache_json_bytes;
        summary.total_tool_result_bytes += entry.tool_result_bytes;
        summary.total_provider_cache_tool_result_bytes += entry.provider_cache_tool_result_bytes;
        summary.total_large_blob_bytes += entry.large_blob_bytes;
        summary.total_provider_cache_large_blob_bytes += entry.provider_cache_large_blob_bytes;
    }

    // Stable sort, largest json_bytes first; ties keep their collection order.
    profiled.sort_by_key(|entry| std::cmp::Reverse(entry.json_bytes));
    profiled.truncate(TOP_SESSION_LIMIT);
    summary.top_by_json_bytes = profiled;

    sample.sessions = Some(summary);
    sample
}
mod state;
use self::state::latest_peer_touches;
pub use self::state::{
FileAccess, SessionControlHandle, SharedContext, SwarmEvent, SwarmEventType, SwarmMember,
SwarmState,
};
use self::state::{
SessionInterruptQueues, fanout_live_client_event, fanout_session_event,
queue_soft_interrupt_for_session, register_session_event_sender,
register_session_interrupt_queue, remove_session_interrupt_queue,
rename_session_interrupt_queue, session_event_fanout_sender, unregister_session_event_sender,
};
pub use crate::plan::{SwarmTaskProgress, VersionedPlan};
pub use self::await_members_state::pending_await_members_for_session;
use self::reload_state::clear_reload_marker_if_stale_for_pid;
#[cfg(test)]
pub(crate) use self::reload_state::subscribe_reload_signal_for_tests;
pub use self::reload_state::{
ReloadAck, ReloadPhase, ReloadSignal, ReloadState, ReloadWaitStatus, acknowledge_reload_signal,
await_reload_handoff, clear_reload_marker, inspect_reload_wait_status,
publish_reload_socket_ready, recent_reload_state, reload_marker_active, reload_marker_exists,
reload_marker_path, reload_process_alive, reload_state_summary, send_reload_signal,
wait_for_reload_ack, wait_for_reload_handoff_event, write_reload_marker, write_reload_state,
};
pub(crate) use self::lifecycle::configure_temporary_server;
#[cfg(unix)]
pub use self::socket::spawn_server_notify;
#[cfg(unix)]
use self::socket::{acquire_daemon_lock, mark_close_on_exec};
pub use self::socket::{
cleanup_socket_pair, connect_socket, debug_socket_path, has_live_listener, is_server_ready,
set_socket_path, socket_path, wait_for_server_ready,
};
use self::socket::{signal_ready_fd, socket_has_live_listener};
pub use self::util::ServerIdentity;
use self::util::{
debug_control_allowed, embedding_idle_unload_secs, git_common_dir_for, server_has_newer_binary,
server_update_candidate, startup_headless_recovery_test_delay, swarm_id_for_dir,
};
mod file_activity;
use self::file_activity::file_activity_scope_label;
#[cfg(test)]
mod socket_tests;
#[cfg(test)]
mod startup_tests;
#[cfg(test)]
mod queue_tests;
#[cfg(test)]
mod file_activity_tests;
/// Idle timeout, in seconds, for the shared server when no clients are connected (5 minutes).
const IDLE_TIMEOUT_SECS: u64 = 5 * 60;
/// Interval, in seconds, between checks for whether the embedding model can be unloaded.
const EMBEDDING_IDLE_CHECK_SECS: u64 = 30;
/// Process exit code used when the server shuts down due to the idle timeout.
pub const EXIT_IDLE_TIMEOUT: i32 = 44;
/// Shared server state.
///
/// Most fields are `Arc`-wrapped or cloneable handles so they can be cloned into
/// spawned tasks (accept loops, bus monitor, recovery tasks) independently of `&self`.
pub struct Server {
/// Provider handle; forked per session/registry where a fresh provider is needed.
provider: Arc<dyn Provider>,
/// Path of the main client socket.
socket_path: PathBuf,
/// Path of the debug socket.
debug_socket_path: PathBuf,
/// Optional gateway configuration override, set via `with_gateway_config`.
gateway_config_override: Option<crate::gateway::GatewayConfig>,
/// Server identity for multi-server support
identity: ServerIdentity,
/// Broadcast channel for streaming events to all subscribers
event_tx: broadcast::Sender<ServerEvent>,
/// Active sessions (session_id -> Agent)
sessions: Arc<RwLock<HashMap<String, Arc<Mutex<Agent>>>>>,
/// Current processing state
is_processing: Arc<RwLock<bool>>,
/// Session ID for the default session
session_id: Arc<RwLock<String>>,
/// Number of connected clients
client_count: Arc<RwLock<usize>>,
/// Connected client mapping (client_id -> connection info)
client_connections: Arc<RwLock<HashMap<String, ClientConnectionInfo>>>,
/// Track file touches: path -> list of accesses
file_touches: Arc<RwLock<HashMap<PathBuf, Vec<FileAccess>>>>,
/// Reverse index for file touches: session_id -> touched paths
files_touched_by_session: Arc<RwLock<HashMap<String, HashSet<PathBuf>>>>,
/// Shared ownership of core swarm coordination state.
swarm_state: SwarmState,
/// Shared context by swarm (swarm_id -> key -> SharedContext)
shared_context: Arc<RwLock<HashMap<String, HashMap<String, SharedContext>>>>,
/// Active and available TUI debug channels (request_id, command)
client_debug_state: Arc<RwLock<ClientDebugState>>,
/// Channel to receive client debug responses from TUI (request_id, response)
client_debug_response_tx: broadcast::Sender<(u64, String)>,
/// Background debug jobs (async debug commands)
debug_jobs: Arc<RwLock<HashMap<String, DebugJob>>>,
/// Channel subscriptions (swarm_id -> channel -> session_ids)
channel_subscriptions: ChannelSubscriptions,
/// Reverse index for channel subscriptions: session_id -> swarm_id -> channels
channel_subscriptions_by_session: ChannelSubscriptions,
/// Event history for real-time event subscription (ring buffer)
event_history: Arc<RwLock<std::collections::VecDeque<SwarmEvent>>>,
/// Counter for event IDs (`Server::new` starts it at 1)
event_counter: Arc<std::sync::atomic::AtomicU64>,
/// Broadcast channel for swarm event subscriptions (debug socket subscribers)
swarm_event_tx: broadcast::Sender<SwarmEvent>,
/// Ambient/schedule runner handle. `Server::new` always stores `Some`, even when
/// ambient mode is disabled, so session-targeted scheduled tasks keep a delivery loop.
ambient_runner: Option<AmbientRunnerHandle>,
/// Shared MCP server pool (processes shared across sessions), initialized lazily.
mcp_pool: Arc<OnceCell<Arc<crate::mcp::SharedMcpPool>>>,
/// Graceful shutdown signals by session_id (stored outside agent mutex so they
/// can be signaled without locking the agent during active tool execution)
shutdown_signals: Arc<RwLock<HashMap<String, InterruptSignal>>>,
/// Soft interrupt queues by session_id (stored outside agent mutex so swarm/debug
/// notifications can be enqueued while an agent is actively processing)
soft_interrupt_queues: SessionInterruptQueues,
/// Persisted communicate await_members wait registry.
await_members_runtime: AwaitMembersRuntime,
/// Persisted dedupe registry for mutating swarm coordinator operations.
swarm_mutation_runtime: SwarmMutationRuntime,
}
impl Server {
pub fn new(provider: Arc<dyn Provider>) -> Self {
use crate::id::{new_memorable_server_id, server_icon};
let (event_tx, _) = broadcast::channel(1024);
let (client_debug_response_tx, _) = broadcast::channel(64);
// Generate a memorable server name
let (id, name) = new_memorable_server_id();
let icon = server_icon(&name).to_string();
let identity = ServerIdentity {
id,
name,
icon,
git_hash: env!("JCODE_GIT_HASH").to_string(),
version: env!("JCODE_VERSION").to_string(),
};
crate::process_title::set_server_title(&identity.name);
// Initialize the background runner even when ambient mode is disabled so
// session-targeted scheduled tasks still have a live delivery loop.
let ambient_runner = {
let safety = Arc::new(crate::safety::SafetySystem::new());
let handle = AmbientRunnerHandle::new(safety);
crate::tool::ambient::init_schedule_runner(handle.clone());
Some(handle)
};
let LoadedSwarmRuntimeState {
plans: restored_swarm_plans,
coordinators: restored_swarm_coordinators,
members: restored_swarm_members,
swarms_by_id: restored_swarms_by_id,
} = load_persisted_swarm_runtime_state();
Self {
provider,
socket_path: socket_path(),
debug_socket_path: debug_socket_path(),
gateway_config_override: None,
identity,
event_tx,
sessions: Arc::new(RwLock::new(HashMap::new())),
is_processing: Arc::new(RwLock::new(false)),
session_id: Arc::new(RwLock::new(String::new())),
client_count: Arc::new(RwLock::new(0)),
client_connections: Arc::new(RwLock::new(HashMap::new())),
file_touches: Arc::new(RwLock::new(HashMap::new())),
files_touched_by_session: Arc::new(RwLock::new(HashMap::new())),
swarm_state: SwarmState::new(
restored_swarm_members,
restored_swarms_by_id,
restored_swarm_plans,
restored_swarm_coordinators,
),
shared_context: Arc::new(RwLock::new(HashMap::new())),
client_debug_state: Arc::new(RwLock::new(ClientDebugState::default())),
client_debug_response_tx,
debug_jobs: Arc::new(RwLock::new(HashMap::new())),
channel_subscriptions: Arc::new(RwLock::new(HashMap::new())),
channel_subscriptions_by_session: Arc::new(RwLock::new(HashMap::new())),
event_history: Arc::new(RwLock::new(std::collections::VecDeque::new())),
event_counter: Arc::new(std::sync::atomic::AtomicU64::new(1)),
swarm_event_tx: broadcast::channel(256).0,
ambient_runner,
mcp_pool: Arc::new(OnceCell::new()),
shutdown_signals: Arc::new(RwLock::new(HashMap::new())),
soft_interrupt_queues: Arc::new(RwLock::new(HashMap::new())),
await_members_runtime: AwaitMembersRuntime::default(),
swarm_mutation_runtime: SwarmMutationRuntime::default(),
}
}
/// Create a server exactly as `Server::new` does, then replace the main and
/// debug socket paths with the ones supplied.
pub fn new_with_paths(
    provider: Arc<dyn Provider>,
    socket_path: PathBuf,
    debug_socket_path: PathBuf,
) -> Self {
    let mut configured = Self::new(provider);
    configured.socket_path = socket_path;
    configured.debug_socket_path = debug_socket_path;
    configured
}
/// Builder-style setter: store `gateway_config` as the gateway configuration
/// override (replacing any previous override) and return the server.
pub fn with_gateway_config(mut self, gateway_config: crate::gateway::GatewayConfig) -> Self {
self.gateway_config_override = Some(gateway_config);
self
}
/// Borrow this server's identity (id, name, icon, git hash, and version).
pub fn identity(&self) -> &ServerIdentity {
&self.identity
}
/// Build a `ServerRuntime` from this server via `ServerRuntime::from_server`.
fn runtime(&self) -> ServerRuntime {
ServerRuntime::from_server(self)
}
/// Assemble the registry entry describing this server.
///
/// Copies the identity fields and both socket paths, records the current process
/// id, stamps `started_at` with the current UTC time (RFC 3339), and starts with
/// an empty session list.
fn build_registry_info(&self) -> crate::registry::ServerInfo {
    let identity = &self.identity;
    let socket = self.socket_path.clone();
    let debug_socket = self.debug_socket_path.clone();
    crate::registry::ServerInfo {
        id: identity.id.clone(),
        name: identity.name.clone(),
        icon: identity.icon.clone(),
        socket,
        debug_socket,
        git_hash: identity.git_hash.clone(),
        version: identity.version.clone(),
        pid: std::process::id(),
        started_at: chrono::Utc::now().to_rfc3339(),
        sessions: Vec::new(),
    }
}
/// Spawn a detached task that builds (and immediately discards) a tool registry
/// from a forked provider, then logs how long the construction took.
fn spawn_registry_prewarm(&self) {
    let provider_handle = Arc::clone(&self.provider);
    tokio::spawn(async move {
        let started = Instant::now();
        // The registry itself is not kept; only the construction work matters here.
        let _ = crate::tool::Registry::new(provider_handle.fork()).await;
        let elapsed_ms = started.elapsed().as_millis();
        crate::logging::info(&format!(
            "Registry prewarm completed in {}ms",
            elapsed_ms
        ));
    });
}
/// Restore headless swarm members known to the swarm state and resume the ones
/// that were interrupted.
///
/// For every member accepted by `headless_member_should_restore`, this loads the
/// stored session, rebuilds an agent with a forked provider and fresh tool
/// registry, registers it with the server, and then either marks it "ready"
/// (nothing to resume) or spawns a task that replays a continuation message.
/// Swarm state is persisted after each status change; a summary timing line is
/// logged at the end.
async fn recover_headless_sessions_on_startup(&self) {
// Snapshot the candidate ids inside a block so the members read lock is
// released before any of the slower work below.
let sessions_to_restore = {
let members = self.swarm_state.members.read().await;
members
.values()
.filter(|member| headless_member_should_restore(&member.status, member.is_headless))
.map(|member| member.session_id.clone())
.collect::<Vec<_>>()
};
if sessions_to_restore.is_empty() {
return;
}
crate::logging::info(&format!(
"Recovering {} headless session(s) after startup: {:?}",
sessions_to_restore.len(),
sessions_to_restore
));
if let Some(delay) = startup_headless_recovery_test_delay() {
crate::logging::info(&format!(
"Applying test-only headless startup recovery delay of {}ms",
delay.as_millis()
));
tokio::time::sleep(delay).await;
}
let mcp_pool = get_shared_mcp_pool(&self.mcp_pool).await;
let recovery_started = Instant::now();
let mut stats = HeadlessRecoveryStats::default();
// Swarms touched by the "skipped" path are persisted once, after the loop.
let mut swarms_to_persist = HashSet::new();
for session_id in sessions_to_restore {
stats.candidates += 1;
let session = match crate::session::Session::load(&session_id) {
Ok(session) => session,
Err(error) => {
// Unloadable session: mark the member failed, persist its swarm
// right away, and move on to the next candidate.
stats.failed_to_load += 1;
crate::logging::warn(&format!(
"Failed to load headless session {} during startup recovery: {}",
session_id, error
));
update_member_status(
&session_id,
"failed",
Some(truncate_detail(&error.to_string(), 120)),
&self.swarm_state.members,
&self.swarm_state.swarms_by_id,
Some(&self.event_history),
Some(&self.event_counter),
Some(&self.swarm_event_tx),
)
.await;
if let Some(swarm_id) = {
let members = self.swarm_state.members.read().await;
members
.get(&session_id)
.and_then(|member| member.swarm_id.clone())
} {
persist_swarm_state_for(&swarm_id, &self.swarm_state).await;
}
continue;
}
};
// Remember the stored status before the session is moved into the agent.
let previous_status = session.status.clone();
let provider = self.provider.fork();
let registry = crate::tool::Registry::new(provider.clone()).await;
if session.is_canary {
registry.register_selfdev_tools().await;
}
registry
.register_mcp_tools(
None,
Some(Arc::clone(&mcp_pool)),
Some("headless".to_string()),
)
.await;
let agent = Arc::new(Mutex::new(Agent::new_with_session(
provider, registry, session, None,
)));
{
let mut sessions = self.sessions.write().await;
// A session with this id is already registered: keep the existing
// agent and discard the one just built.
// NOTE(review): this path increments no stats counter other than
// `candidates` — confirm that is intended.
if sessions.contains_key(&session_id) {
continue;
}
sessions.insert(session_id.clone(), Arc::clone(&agent));
}
{
// Publish the agent's interrupt queue and shutdown signal in the
// server-level maps under this session id.
let agent_guard = agent.lock().await;
register_session_interrupt_queue(
&self.soft_interrupt_queues,
&session_id,
agent_guard.soft_interrupt_queue(),
)
.await;
let mut shutdown_signals = self.shutdown_signals.write().await;
shutdown_signals.insert(session_id.clone(), agent_guard.graceful_shutdown_signal());
}
// Resume when a stored recovery intent exists; otherwise fall back to
// inspecting the restored session (the agent is only locked in that case).
let has_stored_recovery_intent = reload_recovery::has_pending_for_session(&session_id);
let should_resume = has_stored_recovery_intent || {
let agent_guard = agent.lock().await;
self::client_session::restored_session_was_interrupted(
&session_id,
&previous_status,
&agent_guard,
)
};
if !should_resume {
ReloadContext::log_recovery_outcome(
"server_startup_headless",
&session_id,
"skipped",
"restored session was not interrupted by reload",
);
stats.skipped += 1;
update_member_status(
&session_id,
"ready",
None,
&self.swarm_state.members,
&self.swarm_state.swarms_by_id,
Some(&self.event_history),
Some(&self.event_counter),
Some(&self.swarm_event_tx),
)
.await;
if let Some(swarm_id) = {
let members = self.swarm_state.members.read().await;
members
.get(&session_id)
.and_then(|member| member.swarm_id.clone())
} {
swarms_to_persist.insert(swarm_id);
}
continue;
}
// Prefer a claimed stored directive; only consult the session's reload
// context when none was claimed. Errors from either lookup are treated
// as "nothing found".
let stored_directive = reload_recovery::claim_pending_for_session(&session_id)
.ok()
.flatten();
let reload_ctx = if stored_directive.is_none() {
ReloadContext::load_for_session(&session_id).ok().flatten()
} else {
None
};
let reminder = stored_directive
.map(|directive| directive.continuation_message)
.or_else(|| headless_reload_continuation_message(reload_ctx));
let Some(reminder) = reminder else {
// NOTE(review): the outcome is logged as "failed", but the member
// status is left unchanged and no stats counter is incremented on
// this path — confirm that is intended.
ReloadContext::log_recovery_outcome(
"server_startup_headless",
&session_id,
"failed",
"recovery directive missing for interrupted headless session",
);
continue;
};
stats.resumed += 1;
ReloadContext::log_recovery_outcome(
"server_startup_headless",
&session_id,
"resuming",
"restored interrupted headless session after reload",
);
// Clone the shared handles the spawned task needs; it cannot borrow `self`.
let recover_swarm_members = Arc::clone(&self.swarm_state.members);
let recover_swarms_by_id = Arc::clone(&self.swarm_state.swarms_by_id);
let recover_event_history = Arc::clone(&self.event_history);
let recover_event_counter = Arc::clone(&self.event_counter);
let recover_swarm_event_tx = self.swarm_event_tx.clone();
let recover_swarm_state = self.swarm_state.clone();
// The continuation runs in its own task, so the loop does not wait for it.
tokio::spawn(async move {
update_member_status(
&session_id,
"running",
Some("resuming after reload".to_string()),
&recover_swarm_members,
&recover_swarms_by_id,
Some(&recover_event_history),
Some(&recover_event_counter),
Some(&recover_swarm_event_tx),
)
.await;
if let Some(swarm_id) = {
let members = recover_swarm_members.read().await;
members
.get(&session_id)
.and_then(|member| member.swarm_id.clone())
} {
persist_swarm_state_for(&swarm_id, &recover_swarm_state).await;
}
let event_tx = self::state::session_event_fanout_sender(
session_id.clone(),
Arc::clone(&recover_swarm_members),
);
// Empty user message with no attachments; the continuation text is
// passed as the separate reminder argument.
let result = self::client_lifecycle::process_message_streaming_mpsc(
Arc::clone(&agent),
"",
vec![],
Some(reminder),
event_tx,
)
.await;
// Translate the outcome into the member's final status and detail.
let (status, detail) = match result {
Ok(()) => {
ReloadContext::log_recovery_outcome(
"server_startup_headless",
&session_id,
"resumed",
"continuation dispatched successfully",
);
("ready", None)
}
Err(error) => {
ReloadContext::log_recovery_outcome(
"server_startup_headless",
&session_id,
"failed",
&error.to_string(),
);
("failed", Some(truncate_detail(&error.to_string(), 120)))
}
};
update_member_status(
&session_id,
status,
detail,
&recover_swarm_members,
&recover_swarms_by_id,
Some(&recover_event_history),
Some(&recover_event_counter),
Some(&recover_swarm_event_tx),
)
.await;
if let Some(swarm_id) = {
let members = recover_swarm_members.read().await;
members
.get(&session_id)
.and_then(|member| member.swarm_id.clone())
} {
persist_swarm_state_for(&swarm_id, &recover_swarm_state).await;
}
});
}
// Flush the swarms whose members were marked "ready" without resuming.
for swarm_id in swarms_to_persist {
persist_swarm_state_for(&swarm_id, &self.swarm_state).await;
}
crate::logging::info(&format!(
"[TIMING] headless reload startup recovery: candidates={}, resumed={}, skipped={}, failed_to_load={}, total={}ms",
stats.candidates,
stats.resumed,
stats.skipped,
stats.failed_to_load,
recovery_started.elapsed().as_millis()
));
}
/// Complete startup once both listeners are bound.
///
/// Spawns the registry prewarm and the main/debug accept loops, signals
/// readiness, publishes registry metadata, starts the gateway, and finally runs
/// headless session recovery. The order is deliberate: readiness is signalled
/// only after the accept loops exist, and recovery runs last.
///
/// Returns the join handles of the main and debug accept loops, in that order.
async fn finish_startup_after_bind(
&self,
main_listener: Listener,
debug_listener: Listener,
server_start_time: Instant,
) -> (tokio::task::JoinHandle<()>, tokio::task::JoinHandle<()>) {
self.spawn_registry_prewarm();
let registry_info = self.build_registry_info();
let runtime = self.runtime();
let main_handle = runtime.spawn_main_accept_loop(main_listener);
let debug_handle = runtime.spawn_debug_accept_loop(debug_listener, server_start_time);
crate::logging::info("Accept loop tasks spawned");
// Signal readiness to the spawning client only after the accept loops
// are live, so a "ready" server can immediately handle requests.
publish_reload_socket_ready();
signal_ready_fd();
// Persist auxiliary discovery metadata after the server is already live.
self.spawn_registry_metadata_publisher(registry_info);
// Spawn WebSocket gateway for iOS/web clients (if enabled)
// The named binding keeps the returned value alive until this function returns.
let _gateway_handle = self.spawn_gateway(runtime);
// Startup recovery can be expensive in multi-session reloads. Run it
// only after the replacement daemon is already accepting reconnects.
self.recover_headless_sessions_on_startup().await;
(main_handle, debug_handle)
}
fn spawn_background_tasks(
&self,
server_start_time: Instant,
temporary_server_policy: Option<lifecycle::TemporaryServerPolicy>,
) {
// Preload the embedding model in background so warm startups get fast
// memory recall. On a cold install, skip eager preload because the
// first-time model download can make the first spawned client look hung
// while the daemon finishes bootstrapping.
if crate::embedding::is_model_available() {
tokio::task::spawn_blocking(|| {
let start = std::time::Instant::now();
match crate::embedding::get_embedder() {
Ok(_) => {
crate::logging::info(&format!(
"Embedding model preloaded in {}ms",
start.elapsed().as_millis()
));
}
Err(e) => {
crate::logging::info(&format!(
"Embedding model preload failed (non-fatal): {}",
e
));
}
}
});
} else {
crate::logging::info(
"Embedding model not installed yet; skipping eager preload during server startup",
);
}
// Spawn reload monitor (event-driven via in-process channel).
// In the unified server design, self-dev sessions share the main server,
// so the shared server must always listen for reload signals.
let signal_sessions = Arc::clone(&self.sessions);
let signal_swarm_members = Arc::clone(&self.swarm_state.members);
let signal_shutdown_signals = Arc::clone(&self.shutdown_signals);
let signal_swarm_event_tx = self.swarm_event_tx.clone();
tokio::spawn(async move {
await_reload_signal(
signal_sessions,
signal_swarm_members,
signal_shutdown_signals,
signal_swarm_event_tx,
)
.await;
});
// Log when we receive SIGTERM for debugging
#[cfg(unix)]
{
let sigterm_server_name = self.identity.name.clone();
tokio::spawn(async move {
use tokio::signal::unix::{SignalKind, signal};
if let Ok(mut sigterm) = signal(SignalKind::terminate()) {
sigterm.recv().await;
crate::logging::info("Server received SIGTERM, shutting down gracefully");
let _ = crate::registry::unregister_server(&sigterm_server_name).await;
std::process::exit(0);
}
});
}
// Spawn the bus monitor for swarm coordination
let monitor_file_touches = Arc::clone(&self.file_touches);
let monitor_files_touched_by_session = Arc::clone(&self.files_touched_by_session);
let monitor_swarm_members = Arc::clone(&self.swarm_state.members);
let monitor_swarms_by_id = Arc::clone(&self.swarm_state.swarms_by_id);
let monitor_swarm_plans = Arc::clone(&self.swarm_state.plans);
let monitor_swarm_coordinators = Arc::clone(&self.swarm_state.coordinators);
let monitor_shared_context = Arc::clone(&self.shared_context);
let monitor_sessions = Arc::clone(&self.sessions);
let monitor_soft_interrupt_queues = Arc::clone(&self.soft_interrupt_queues);
let monitor_event_history = Arc::clone(&self.event_history);
let monitor_event_counter = Arc::clone(&self.event_counter);
let monitor_swarm_event_tx = self.swarm_event_tx.clone();
tokio::spawn(async move {
Self::monitor_bus(
monitor_file_touches,
monitor_files_touched_by_session,
monitor_swarm_members,
monitor_swarms_by_id,
monitor_swarm_plans,
monitor_swarm_coordinators,
monitor_shared_context,
monitor_sessions,
monitor_soft_interrupt_queues,
monitor_event_history,
monitor_event_counter,
monitor_swarm_event_tx,
)
.await;
});
let stale_swarm_members = Arc::clone(&self.swarm_state.members);
let stale_swarms_by_id = Arc::clone(&self.swarm_state.swarms_by_id);
let stale_swarm_plans = Arc::clone(&self.swarm_state.plans);
let stale_swarm_coordinators = Arc::clone(&self.swarm_state.coordinators);
tokio::spawn(async move {
let mut interval =
tokio::time::interval(crate::server::swarm::swarm_task_sweep_interval());
interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
loop {
interval.tick().await;
refresh_swarm_task_staleness(
&stale_swarm_members,
&stale_swarms_by_id,
&stale_swarm_plans,
&stale_swarm_coordinators,
)
.await;
}
});
// Initialize the memory agent early so it's ready for all sessions
if crate::config::config().features.memory {
tokio::spawn(async {
let _ = crate::memory_agent::init().await;
});
}
// Spawn the background ambient/schedule loop.
if let Some(ref runner) = self.ambient_runner {
let ambient_handle = runner.clone();
let ambient_provider = Arc::clone(&self.provider);
crate::logging::info("Starting ambient/schedule background loop");
tokio::spawn(async move {
ambient_handle.run_loop(ambient_provider).await;
});
}
// Spawn embedding idle monitor so the model can be unloaded when this
// server has been quiet for a while.