fix(openai-gateway): route APIKey accounts to /v1/chat/completions when upstream lacks Responses API

OpenAI APIKey accounts with base_url pointing to third-party OpenAI-compatible
upstreams (DeepSeek, Kimi, GLM, Qwen, etc.) were failing because the gateway
unconditionally converted Chat Completions requests to Responses format and
forwarded to {base_url}/v1/responses, which only exists on OpenAI's official
endpoint.

Detection-based routing:
- Probe upstream capability on account create/update via a minimal POST to
  /v1/responses; HTTP 404/405 means 'unsupported', any other response means
  'supported'.
- Persist result as accounts.extra.openai_responses_supported (bool).
- ForwardAsChatCompletions branches at function entry: APIKey accounts with
  explicit support=false go through new forwardAsRawChatCompletions which
  passthrough-forwards CC body to /v1/chat/completions without protocol
  conversion.

Default behavior for accounts without the marker preserves the legacy
'always Responses' path — existing OpenAI APIKey accounts that were working
before this change continue to work without modification (the 'reality is
evidence' principle: an account that has been running implies upstream
capability).

Probe is fired async after Create / Update / BatchCreate; failures only log,
never block the admin flow. BulkUpdate omitted (low signal of base_url
changes; can be added if needed).

Implementation:
- New pkg internal/pkg/openai_compat: marker key + ShouldUseResponsesAPI
- New service file openai_apikey_responses_probe.go: probe + persist
- New service file openai_gateway_chat_completions_raw.go: CC pass-through
- Account test endpoint short-circuits with explicit message for
  probed-unsupported accounts (full CC test path is a TODO)

Zero schema changes, zero migrations, zero frontend changes, zero wire
modifications — all wired through existing AccountTestService injection.

Closes: DeepSeek-OpenAI account (id=128) production failure
This commit is contained in:
alfadb-bot
2026-04-30 19:25:45 +08:00
parent 48912014a1
commit 4e4cc80971
7 changed files with 708 additions and 3 deletions

View File

@@ -0,0 +1,149 @@
package service
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
"strings"
"time"
"github.com/Wei-Shaw/sub2api/internal/pkg/logger"
"github.com/Wei-Shaw/sub2api/internal/pkg/openai"
"github.com/Wei-Shaw/sub2api/internal/pkg/openai_compat"
)
// openaiResponsesProbeTimeout 是探测请求的超时时长。
// 探测必须快速失败——超时不应阻塞账号创建/更新流程。
const openaiResponsesProbeTimeout = 8 * time.Second
// openaiResponsesProbePayload 是探测使用的最小 Responses 请求体。
// 仅作能力探测不期望响应内容质量Stream=false 减少 SSE 解析开销。
//
// 注意:探测的目标是区分"端点存在"与"端点不存在"——只要上游返回非 404 的
// 4xx/5xx如 400 invalid_request_error / 401 unauthorized / 422 等),
// 都视为"端点存在 → 支持 Responses"。仅 404 / 405 视为"端点不存在"。
func openaiResponsesProbePayload(modelID string) []byte {
if strings.TrimSpace(modelID) == "" {
modelID = openai.DefaultTestModel
}
body, _ := json.Marshal(map[string]any{
"model": modelID,
"input": []map[string]any{
{
"role": "user",
"content": []map[string]any{
{"type": "input_text", "text": "hi"},
},
},
},
"instructions": openai.DefaultInstructions,
"stream": false,
})
return body
}
// ProbeOpenAIAPIKeyResponsesSupport 探测 OpenAI APIKey 账号上游是否支持
// /v1/responses 端点,并将结果持久化到 accounts.extra.openai_responses_supported。
//
// 调用时机:账号创建/更新后,且仅当 platform=openai && type=apikey 时。
//
// 探测策略(参见包文档 internal/pkg/openai_compat
// - 上游 404 / 405 → 不支持,写 false
// - 上游 2xx / 其他 4xx401/422/400 等)/ 5xx → 支持,写 true
// - 网络层失败(连接错误、超时)→ 不写标记,保持 unknown
// (后续请求仍按"现状即证据"默认走 Responses
//
// 该方法是幂等的:重复调用会以最新探测结果覆盖标记。
//
// 关于失败处理:探测本身的失败不应阻塞账号创建——账号能创建/更新成功就够了,
// 探测结果只影响后续路由优化。所有错误都仅记录日志,不向调用方传播。
func (s *AccountTestService) ProbeOpenAIAPIKeyResponsesSupport(ctx context.Context, accountID int64) {
account, err := s.accountRepo.GetByID(ctx, accountID)
if err != nil {
logger.LegacyPrintf("service.openai_probe", "probe_load_account_failed: account_id=%d err=%v", accountID, err)
return
}
if account.Platform != PlatformOpenAI || account.Type != AccountTypeAPIKey {
// 仅 OpenAI APIKey 账号需要探测;其他账号类型无能力差异。
return
}
apiKey := account.GetOpenAIApiKey()
if apiKey == "" {
logger.LegacyPrintf("service.openai_probe", "probe_skip_no_apikey: account_id=%d", accountID)
return
}
baseURL := account.GetOpenAIBaseURL()
if baseURL == "" {
baseURL = "https://api.openai.com"
}
normalizedBaseURL, err := s.validateUpstreamBaseURL(baseURL)
if err != nil {
logger.LegacyPrintf("service.openai_probe", "probe_invalid_baseurl: account_id=%d base_url=%q err=%v", accountID, baseURL, err)
return
}
probeURL := strings.TrimSuffix(normalizedBaseURL, "/") + "/responses"
probeCtx, cancel := context.WithTimeout(ctx, openaiResponsesProbeTimeout)
defer cancel()
req, err := http.NewRequestWithContext(probeCtx, http.MethodPost, probeURL, bytes.NewReader(openaiResponsesProbePayload("")))
if err != nil {
logger.LegacyPrintf("service.openai_probe", "probe_build_request_failed: account_id=%d err=%v", accountID, err)
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("Authorization", "Bearer "+apiKey)
req.Header.Set("Accept", "application/json")
proxyURL := ""
if account.ProxyID != nil && account.Proxy != nil {
proxyURL = account.Proxy.URL()
}
resp, err := s.httpUpstream.DoWithTLS(req, proxyURL, account.ID, account.Concurrency, s.tlsFPProfileService.ResolveTLSProfile(account))
if err != nil {
// 网络层失败:不写标记,保持 unknown下次重试或由网关 fallback 处理
logger.LegacyPrintf("service.openai_probe", "probe_request_failed: account_id=%d url=%s err=%v", accountID, probeURL, err)
return
}
defer func() {
_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, 1<<20))
_ = resp.Body.Close()
}()
supported := isResponsesEndpointSupportedByStatus(resp.StatusCode)
if err := s.accountRepo.UpdateExtra(ctx, accountID, map[string]any{
openai_compat.ExtraKeyResponsesSupported: supported,
}); err != nil {
logger.LegacyPrintf("service.openai_probe", "probe_persist_failed: account_id=%d supported=%v err=%v", accountID, supported, err)
return
}
logger.LegacyPrintf("service.openai_probe",
"probe_done: account_id=%d base_url=%s status=%d supported=%v",
accountID, normalizedBaseURL, resp.StatusCode, supported,
)
}
// isResponsesEndpointSupportedByStatus 根据探测响应的 HTTP 状态码判定上游
// 是否暴露 /v1/responses 端点。
//
// 关键观察:第三方 OpenAI 兼容上游DeepSeek/Kimi 等)对未知端点统一返回 404
// 或 405而 OpenAI 官方/有 Responses 实现的上游会因为请求体最简(缺字段)
// 返回 400/422 等业务错误,但端点本身存在。
//
// 因此:仅 404 和 405 视为"端点不存在",其他 status 视为"端点存在"。
//
// 5xx 也视为"端点存在"——上游偶发故障不应误判为不支持。
func isResponsesEndpointSupportedByStatus(status int) bool {
switch status {
case http.StatusNotFound, http.StatusMethodNotAllowed:
return false
}
return true
}