feat: implement launcher orchestrator and startup monitoring infrastructure for host lifecycle management

This commit is contained in:
lincube
2026-06-14 12:59:36 +08:00
parent 13895e0f43
commit 2793be68d4
8 changed files with 239 additions and 30 deletions

View File

@@ -208,13 +208,15 @@ internal sealed class HostLaunchService
/// <summary>
/// Attempts to start the AirApp Runtime ahead of the host by calling
/// <c>EnsureStartedAsync</c> on a new <c>AirAppRuntimeBridge</c>.
/// Exceptions raised by that call are logged as warnings and not rethrown.
/// </summary>
/// <param name="appRoot">Forwarded as the first constructor argument of <c>AirAppRuntimeBridge</c>.</param>
/// <param name="dataRoot">Forwarded as the second constructor argument of <c>AirAppRuntimeBridge</c>; may be null.</param>
private static async Task EnsureAirAppRuntimeStartedAsync(string appRoot, string? dataRoot)
{
Logger.Info("HOST LAUNCH: Attempting to pre-start AirApp Runtime...");
try
{
await new AirAppRuntimeBridge(appRoot, dataRoot).EnsureStartedAsync().ConfigureAwait(false);
Logger.Info("HOST LAUNCH: AirApp Runtime pre-start completed.");
}
catch (Exception ex)
{
// Best-effort step: the failure is only logged; the message itself states that the Host fallback remains available.
// NOTE(review): the two Warn calls below differ only in the "HOST LAUNCH:" prefix and the trailing period.
// They look like the removed and added versions of one line shown together by a diff view — confirm that only one should remain.
Logger.Warn($"AirApp Runtime pre-start failed; Host fallback remains available. Error='{ex.Message}'.");
Logger.Warn($"HOST LAUNCH: AirApp Runtime pre-start failed; Host fallback remains available. Error='{ex.Message}'");
}
}
@@ -249,6 +251,11 @@ internal sealed class HostLaunchService
try
{
Logger.Info($"ATTEMPTING HOST START: Path='{plan.HostPath}'; WorkingDir='{plan.WorkingDirectory}'; Mode='{startMode}'");
Logger.Info($" Arguments: {HostLaunchPlanBuilder.FormatArgumentsForLog(plan.Arguments)}");
Logger.Info($" File exists: {File.Exists(plan.HostPath)}");
Logger.Info($" Working dir exists: {Directory.Exists(plan.WorkingDirectory)}");
var process = Process.Start(startInfo);
Logger.Info(
$"Host launch requested. Mode='{startMode}'; RetryTag='{retryTag ?? "<none>"}'; Path='{plan.HostPath}'; " +
@@ -257,15 +264,30 @@ internal sealed class HostLaunchService
if (process is null)
{
Logger.Error($"CRITICAL: Process.Start returned null! Path='{plan.HostPath}'; Mode='{startMode}'");
Console.Error.WriteLine($"[CRITICAL] Process.Start returned null for path: {plan.HostPath}");
return HostStartAttempt.StartFailed(startMode, "process_start_returned_null", plan);
}
await Task.Yield();
// 等待一小段时间,检查进程是否立即退出
await Task.Delay(500).ConfigureAwait(false);
if (process.HasExited)
{
Logger.Error($"CRITICAL: Host process exited immediately! ExitCode={process.ExitCode}; Path='{plan.HostPath}'");
Console.Error.WriteLine($"[CRITICAL] Host process exited immediately with code {process.ExitCode}");
return HostStartAttempt.StartFailed(startMode, $"process_exited_immediately_code_{process.ExitCode}", plan);
}
Logger.Info($"Host process started successfully and is running. PID={process.Id}");
return HostStartAttempt.Started(startMode, process, plan);
}
catch (Exception ex)
{
Logger.Error($"Host start failed. Mode='{startMode}'.", ex);
Logger.Error($"CRITICAL: Host start exception! Path='{plan.HostPath}'; Mode='{startMode}'; Exception={ex.GetType().Name}; Message='{ex.Message}'", ex);
Console.Error.WriteLine($"[CRITICAL] Host start failed: {ex.Message}");
Console.Error.WriteLine($"[CRITICAL] Path: {plan.HostPath}");
Console.Error.WriteLine($"[CRITICAL] Exception: {ex}");
return HostStartAttempt.StartFailed(startMode, ex.GetType().Name, plan);
}
}

View File

@@ -86,7 +86,7 @@ internal sealed class HostStartupMonitor
]).ConfigureAwait(false);
if (!connected)
{
Logger.Info("Host public IPC is not ready yet. Launcher will keep monitoring the host process and retry.");
Logger.Info("Host public IPC is not ready yet after initial connection attempts. Launcher will keep monitoring the host process and retry periodically.");
}
else
{
@@ -106,6 +106,8 @@ internal sealed class HostStartupMonitor
var nextShellStatusPollAt = DateTimeOffset.UtcNow + StartupTimeoutPolicy.ShellStatusPollInterval;
var ipcReconnectAttemptIndex = 0;
var activationRetryAttempted = false;
var lastIpcConnectionFailureReported = DateTimeOffset.MinValue;
var ipcConnectionFailureCount = 0;
while (true)
{
@@ -224,6 +226,7 @@ internal sealed class HostStartupMonitor
if (connected)
{
ipcConnected = true;
Logger.Info($"Host public IPC reconnected successfully after {ipcConnectionFailureCount} failed attempts.");
var shellSuccess = await RefreshShellStatusAsync("Host public IPC reconnected; waiting for desktop shell.")
.ConfigureAwait(false);
if (shellSuccess is not null)
@@ -232,6 +235,18 @@ internal sealed class HostStartupMonitor
continue;
}
}
else
{
ipcConnectionFailureCount++;
// 每 30 秒报告一次 IPC 连接失败
if ((now - lastIpcConnectionFailureReported).TotalSeconds >= 30)
{
lastIpcConnectionFailureReported = now;
var elapsed = (now - startedAt).TotalSeconds;
Logger.Warn($"Host public IPC connection still unavailable after {elapsed:0}s and {ipcConnectionFailureCount} reconnect attempts. Host process is alive (PID={request.HostProcess.Id}).");
request.Reporter.Report("diagnostic", $"正在等待主应用响应... (已尝试 {ipcConnectionFailureCount} 次)");
}
}
nextReconnectAttemptAt = DateTimeOffset.UtcNow + StartupTimeoutPolicy.IpcReconnectInterval;
}
@@ -263,6 +278,16 @@ internal sealed class HostStartupMonitor
nextCheckpointAt = softTimeoutAt;
}
if (!ipcConnected && nextReconnectAttemptAt < nextCheckpointAt)
{
nextCheckpointAt = nextReconnectAttemptAt;
}
if (ipcConnected && nextShellStatusPollAt < nextCheckpointAt)
{
nextCheckpointAt = nextShellStatusPollAt;
}
var delay = nextCheckpointAt - now;
if (delay > TimeSpan.FromSeconds(1))
{
@@ -351,11 +376,11 @@ internal sealed class HostStartupMonitor
if (!connected && !request.HostProcess.HasExited)
{
request.AttemptRegistry.MarkOwnedWaitingForShell("Host process is still running, but public IPC is not ready yet.");
request.PublishCoordinatorStatus(true, false, true);
request.PublishCoordinatorStatus(true, true, false);
return new Outcome(
true,
"startup_pending",
"Host process is still running; Launcher will not start another process while public IPC finishes startup.",
false,
"ipc_connection_failed",
$"Host process is still running after {StartupTimeoutPolicy.HardTimeout.TotalSeconds:0} seconds, but public IPC connection could not be established. This may indicate the host is stuck during initialization.",
recoveryActivationAttempted,
request.ComposeLaunchDetails(true, recoveryActivationAttempted));
}

View File

@@ -89,6 +89,14 @@ internal sealed class StartupAttemptRegistry
ExecuteWithLock(() =>
{
var existing = LoadUnsafe();
// Discard a stale record: once it is set to null, the `existing is not null` checks that follow skip it.
if (existing is not null && IsStaleAttempt(existing))
{
    // Use "0.0" for one decimal place. In a custom numeric format string only '0' and '#'
    // are digit placeholders, so a "0.1" specifier would emit the '1' as a literal character.
    var ageMinutes = (DateTimeOffset.UtcNow - existing.UpdatedAtUtc).TotalMinutes;
    Logger.Info($"Cleaning up stale startup attempt record. AttemptId='{existing.AttemptId}'; State='{existing.State}'; Age={ageMinutes:0.0}min.");
    existing = null;
}
if (existing is not null && IsCoordinatorLive(existing))
{
active = Clone(existing);
@@ -145,6 +153,34 @@ internal sealed class StartupAttemptRegistry
return reserved is not null;
}
/// <summary>
/// Decides whether a stored startup attempt record should be treated as stale.
/// </summary>
/// <remarks>
/// A record is stale when any of these rules holds:
/// <list type="number">
/// <item><description><c>UpdatedAtUtc</c> is more than 10 minutes old.</description></item>
/// <item><description>A coordinator PID is recorded, that process is not live, and <c>HeartbeatAtUtc</c> is more than 2 minutes old.</description></item>
/// <item><description>Both a host PID and a coordinator PID are recorded and neither process is live.</description></item>
/// </list>
/// The clock and the coordinator liveness are each sampled once so that every rule
/// is evaluated against the same snapshot.
/// </remarks>
/// <param name="record">The record to inspect.</param>
/// <returns><c>true</c> when the record matches one of the rules above; otherwise <c>false</c>.</returns>
private static bool IsStaleAttempt(StartupAttemptRecord record)
{
    var now = DateTimeOffset.UtcNow;

    // Rule 1: age alone is sufficient.
    // NOTE(review): the record's State is not consulted here, so a non-terminal record
    // older than 10 minutes is also treated as stale — confirm that this is intended.
    if (now - record.UpdatedAtUtc > TimeSpan.FromMinutes(10))
    {
        return true;
    }

    // Rules 2 and 3 both require a recorded coordinator that is no longer live,
    // so probe the coordinator once and bail out early otherwise.
    if (record.CoordinatorPid <= 0 || TryGetLiveProcess(record.CoordinatorPid, out _))
    {
        return false;
    }

    // Rule 2: coordinator is gone and its heartbeat has timed out.
    if (now - record.HeartbeatAtUtc > TimeSpan.FromMinutes(2))
    {
        return true;
    }

    // Rule 3: coordinator is gone and the recorded host process is gone as well.
    return record.HostPid > 0 && !TryGetLiveProcess(record.HostPid, out _);
}
public StartupAttemptRecord? GetOwnedAttempt()
{
StartupAttemptRecord? result = null;

View File

@@ -2,22 +2,26 @@ namespace LanMountainDesktop.Launcher.Startup;
// NOTE(review): several members below appear twice with different values (for example SoftTimeout 30s and 45s,
// HardTimeout 120s and 180s), and the reconnect-timeout array lists 5000 ms twice. This looks like the removed and
// added lines of a diff shown together — confirm which declarations are current before relying on a value.
/// <summary>
/// Timeout and polling-interval values for launcher startup; HostStartupMonitor reads
/// ShellStatusPollInterval, IpcReconnectInterval and HardTimeout from here.
/// </summary>
internal static class StartupTimeoutPolicy
{
public static readonly TimeSpan SoftTimeout = TimeSpan.FromSeconds(30);
public static readonly TimeSpan HardTimeout = TimeSpan.FromSeconds(120);
public static readonly TimeSpan SoftTimeout = TimeSpan.FromSeconds(45);
public static readonly TimeSpan HardTimeout = TimeSpan.FromSeconds(180);
/// <summary>Initial Public IPC connect attempt (AOT cold start may be slower).</summary>
public static readonly TimeSpan InitialIpcConnectTimeout = TimeSpan.FromMilliseconds(1200);
/// <summary>Initial Public IPC connect attempt (AOT cold start is significantly slower).</summary>
public static readonly TimeSpan InitialIpcConnectTimeout = TimeSpan.FromMilliseconds(3000);
/// <summary>Subsequent reconnect attempts use increasing per-try timeouts.</summary>
public static readonly TimeSpan[] IpcReconnectAttemptTimeouts =
[
TimeSpan.FromMilliseconds(800),
TimeSpan.FromMilliseconds(1500),
TimeSpan.FromMilliseconds(3000),
TimeSpan.FromMilliseconds(5000)
TimeSpan.FromMilliseconds(5000),
TimeSpan.FromMilliseconds(8000),
TimeSpan.FromMilliseconds(10000)
];
public static readonly TimeSpan ExistingHostProbeTimeout = TimeSpan.FromMilliseconds(900);
public static readonly TimeSpan ExistingHostProbeTimeout = TimeSpan.FromMilliseconds(1500);
// Added to the current UTC time by HostStartupMonitor to schedule the next shell status poll.
public static readonly TimeSpan ShellStatusPollInterval = TimeSpan.FromSeconds(1);
// Added to the current UTC time by HostStartupMonitor to schedule the next IPC reconnect attempt.
public static readonly TimeSpan IpcReconnectInterval = TimeSpan.FromSeconds(2);
public static readonly TimeSpan IpcReconnectInterval = TimeSpan.FromSeconds(3);
/// <summary>Maximum time to wait for host process exit after it starts (for early-exit detection).</summary>
public static readonly TimeSpan HostEarlyExitWindow = TimeSpan.FromSeconds(5);
}