fix(openai-gateway): route APIKey accounts to /v1/chat/completions when upstream lacks Responses API

OpenAI APIKey accounts with base_url pointing to third-party OpenAI-compatible upstreams (DeepSeek, Kimi, GLM, Qwen, etc.) were failing because the gateway unconditionally converted Chat Completions requests to Responses format and forwarded to {base_url}/v1/responses, which only exists on OpenAI's official endpoint. Detection-based routing: - Probe upstream capability on account create/update via a minimal POST to /v1/responses; HTTP 404/405 means 'unsupported', any other response means 'supported'. - Persist result as accounts.extra.openai_responses_supported (bool). - ForwardAsChatCompletions branches at function entry: APIKey accounts with explicit support=false go through new forwardAsRawChatCompletions which passthrough-forwards CC body to /v1/chat/completions without protocol conversion. Default behavior for accounts without the marker preserves the legacy 'always Responses' path — existing OpenAI APIKey accounts that were working before this change continue to work without modification (the 'reality is evidence' principle: an account that has been running implies upstream capability). Probe is fired async after Create / Update / BatchCreate; failures only log, never block the admin flow. BulkUpdate omitted (low signal of base_url changes; can be added if needed). Implementation: - New pkg internal/pkg/openai_compat: marker key + ShouldUseResponsesAPI - New service file openai_apikey_responses_probe.go: probe + persist - New service file openai_gateway_chat_completions_raw.go: CC pass-through - Account test endpoint short-circuits with explicit message for probed-unsupported accounts (full CC test path is a TODO) Zero schema changes, zero migrations, zero frontend changes, zero wire modifications — all wired through existing AccountTestService injection. Closes: DeepSeek-OpenAI account (id=128) production failure
2026-05-04 21:20:51 +08:00 · 2026-04-30 19:25:45 +08:00
parent 48912014a1
commit 4e4cc80971
7 changed files with 708 additions and 3 deletions
--- a/backend/internal/service/openai_apikey_responses_probe.go
+++ b/backend/internal/service/openai_apikey_responses_probe.go
@@ -0,0 +1,149 @@
+package service
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/Wei-Shaw/sub2api/internal/pkg/logger"
+	"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
+	"github.com/Wei-Shaw/sub2api/internal/pkg/openai_compat"
+)
+
+// openaiResponsesProbeTimeout 是探测请求的超时时长。
+// 探测必须快速失败——超时不应阻塞账号创建/更新流程。
+const openaiResponsesProbeTimeout = 8 * time.Second
+
+// openaiResponsesProbePayload 是探测使用的最小 Responses 请求体。
+// 仅作能力探测，不期望响应内容质量；Stream=false 减少 SSE 解析开销。
+//
+// 注意：探测的目标是区分"端点存在"与"端点不存在"——只要上游返回非 404 的
+// 4xx/5xx（如 400 invalid_request_error / 401 unauthorized / 422 等），
+// 都视为"端点存在 → 支持 Responses"。仅 404 / 405 视为"端点不存在"。
+func openaiResponsesProbePayload(modelID string) []byte {
+	if strings.TrimSpace(modelID) == "" {
+		modelID = openai.DefaultTestModel
+	}
+	body, _ := json.Marshal(map[string]any{
+		"model": modelID,
+		"input": []map[string]any{
+			{
+				"role": "user",
+				"content": []map[string]any{
+					{"type": "input_text", "text": "hi"},
+				},
+			},
+		},
+		"instructions": openai.DefaultInstructions,
+		"stream":       false,
+	})
+	return body
+}
+
+// ProbeOpenAIAPIKeyResponsesSupport 探测 OpenAI APIKey 账号上游是否支持
+// /v1/responses 端点，并将结果持久化到 accounts.extra.openai_responses_supported。
+//
+// 调用时机：账号创建/更新后，且仅当 platform=openai && type=apikey 时。
+//
+// 探测策略（参见包文档 internal/pkg/openai_compat）：
+//   - 上游 404 / 405 → 不支持，写 false
+//   - 上游 2xx / 其他 4xx（401/422/400 等）/ 5xx → 支持，写 true
+//   - 网络层失败（连接错误、超时）→ 不写标记，保持 unknown
+//     （后续请求仍按"现状即证据"默认走 Responses）
+//
+// 该方法是幂等的：重复调用会以最新探测结果覆盖标记。
+//
+// 关于失败处理：探测本身的失败不应阻塞账号创建——账号能创建/更新成功就够了，
+// 探测结果只影响后续路由优化。所有错误都仅记录日志，不向调用方传播。
+func (s *AccountTestService) ProbeOpenAIAPIKeyResponsesSupport(ctx context.Context, accountID int64) {
+	account, err := s.accountRepo.GetByID(ctx, accountID)
+	if err != nil {
+		logger.LegacyPrintf("service.openai_probe", "probe_load_account_failed: account_id=%d err=%v", accountID, err)
+		return
+	}
+	if account.Platform != PlatformOpenAI || account.Type != AccountTypeAPIKey {
+		// 仅 OpenAI APIKey 账号需要探测；其他账号类型无能力差异。
+		return
+	}
+
+	apiKey := account.GetOpenAIApiKey()
+	if apiKey == "" {
+		logger.LegacyPrintf("service.openai_probe", "probe_skip_no_apikey: account_id=%d", accountID)
+		return
+	}
+	baseURL := account.GetOpenAIBaseURL()
+	if baseURL == "" {
+		baseURL = "https://api.openai.com"
+	}
+	normalizedBaseURL, err := s.validateUpstreamBaseURL(baseURL)
+	if err != nil {
+		logger.LegacyPrintf("service.openai_probe", "probe_invalid_baseurl: account_id=%d base_url=%q err=%v", accountID, baseURL, err)
+		return
+	}
+
+	probeURL := strings.TrimSuffix(normalizedBaseURL, "/") + "/responses"
+
+	probeCtx, cancel := context.WithTimeout(ctx, openaiResponsesProbeTimeout)
+	defer cancel()
+
+	req, err := http.NewRequestWithContext(probeCtx, http.MethodPost, probeURL, bytes.NewReader(openaiResponsesProbePayload("")))
+	if err != nil {
+		logger.LegacyPrintf("service.openai_probe", "probe_build_request_failed: account_id=%d err=%v", accountID, err)
+		return
+	}
+	req.Header.Set("Content-Type", "application/json")
+	req.Header.Set("Authorization", "Bearer "+apiKey)
+	req.Header.Set("Accept", "application/json")
+
+	proxyURL := ""
+	if account.ProxyID != nil && account.Proxy != nil {
+		proxyURL = account.Proxy.URL()
+	}
+
+	resp, err := s.httpUpstream.DoWithTLS(req, proxyURL, account.ID, account.Concurrency, s.tlsFPProfileService.ResolveTLSProfile(account))
+	if err != nil {
+		// 网络层失败：不写标记，保持 unknown，下次重试或由网关 fallback 处理
+		logger.LegacyPrintf("service.openai_probe", "probe_request_failed: account_id=%d url=%s err=%v", accountID, probeURL, err)
+		return
+	}
+	defer func() {
+		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))
+		_ = resp.Body.Close()
+	}()
+
+	supported := isResponsesEndpointSupportedByStatus(resp.StatusCode)
+
+	if err := s.accountRepo.UpdateExtra(ctx, accountID, map[string]any{
+		openai_compat.ExtraKeyResponsesSupported: supported,
+	}); err != nil {
+		logger.LegacyPrintf("service.openai_probe", "probe_persist_failed: account_id=%d supported=%v err=%v", accountID, supported, err)
+		return
+	}
+
+	logger.LegacyPrintf("service.openai_probe",
+		"probe_done: account_id=%d base_url=%s status=%d supported=%v",
+		accountID, normalizedBaseURL, resp.StatusCode, supported,
+	)
+}
+
+// isResponsesEndpointSupportedByStatus 根据探测响应的 HTTP 状态码判定上游
+// 是否暴露 /v1/responses 端点。
+//
+// 关键观察：第三方 OpenAI 兼容上游（DeepSeek/Kimi 等）对未知端点统一返回 404
+// 或 405；而 OpenAI 官方/有 Responses 实现的上游会因为请求体最简（缺字段）
+// 返回 400/422 等业务错误，但端点本身存在。
+//
+// 因此：仅 404 和 405 视为"端点不存在"，其他 status 视为"端点存在"。
+//
+// 5xx 也视为"端点存在"——上游偶发故障不应误判为不支持。
+func isResponsesEndpointSupportedByStatus(status int) bool {
+	switch status {
+	case http.StatusNotFound, http.StatusMethodNotAllowed:
+		return false
+	}
+	return true
+}