pkg-proxy/internal/handler/pypi.go

527 lines
15 KiB
Go
Raw Permalink Normal View History

2026-01-20 21:52:44 +00:00
package handler
import (
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
2026-01-20 21:52:44 +00:00
"fmt"
"io"
"net/http"
"net/url"
"regexp"
"strings"
"time"
"github.com/git-pkgs/purl"
2026-01-20 21:52:44 +00:00
)
// Upstream location and parsing thresholds used by the PyPI handler.
const (
	// pypiUpstream is the base URL of the upstream PyPI index.
	pypiUpstream = "https://pypi.org"

	// minWheelParts is the fewest hyphen-separated fields a wheel base name
	// may have: name + version + python + abi + platform.
	minWheelParts = 5

	// minSubmatchParts is the fewest entries a regexp submatch must have to be
	// usable: the full match plus the first capture group.
	minSubmatchParts = 2

	// minPyPIPathParts is the fewest slash-separated segments accepted in a
	// download path: hash_prefix + hash + filename.
	minPyPIPathParts = 3

	// minPythonTagLen is the minimum length of a python tag (e.g., "py").
	minPythonTagLen = 2
)
// PyPIHandler serves the PyPI registry protocol: the simple index, the JSON
// API and package downloads.
type PyPIHandler struct {
	// proxy is the shared Proxy the handler delegates to for logging,
	// upstream HTTP requests, cooldown decisions and caching.
	proxy *Proxy

	// upstreamURL is the base URL of the upstream index; proxyURL is the
	// externally visible base URL of this proxy, used when rewriting links.
	upstreamURL, proxyURL string
}
// NewPyPIHandler builds a PyPI protocol handler backed by proxy.
//
// proxyURL is the public base URL of this proxy; a single trailing slash is
// removed so that rewritten links can append paths directly. The upstream is
// always pypiUpstream.
func NewPyPIHandler(proxy *Proxy, proxyURL string) *PyPIHandler {
	handler := new(PyPIHandler)
	handler.proxy = proxy
	handler.upstreamURL = pypiUpstream
	handler.proxyURL = strings.TrimSuffix(proxyURL, "/")
	return handler
}
// Routes returns an http.Handler that dispatches PyPI requests to the
// handler's endpoints. All routes are GET-only.
func (h *PyPIHandler) Routes() http.Handler {
	routes := []struct {
		pattern string
		handle  http.HandlerFunc
	}{
		// Simple API (used by pip).
		{"GET /simple/", h.handleSimpleIndex},
		{"GET /simple/{name}/", h.handleSimplePackage},
		// JSON API.
		{"GET /pypi/{name}/json", h.handleJSON},
		{"GET /pypi/{name}/{version}/json", h.handleVersionJSON},
		// Package downloads (these are cached).
		{"GET /packages/{path...}", h.handleDownload},
	}

	mux := http.NewServeMux()
	for _, route := range routes {
		mux.HandleFunc(route.pattern, route.handle)
	}
	return mux
}
// handleSimpleIndex serves the top-level simple API index by relaying the
// upstream response through proxySimple.
func (h *PyPIHandler) handleSimpleIndex(w http.ResponseWriter, r *http.Request) {
	const indexPath = "/simple/"
	h.proxySimple(w, r, indexPath)
}
// handleSimplePackage serves the simple API package page with rewritten links.
func (h *PyPIHandler) handleSimplePackage(w http.ResponseWriter, r *http.Request) {
name := r.PathValue("name")
if name == "" {
http.Error(w, "invalid package name", http.StatusBadRequest)
return
}
h.proxy.Logger.Info("pypi simple request", "package", name)
upstreamURL := fmt.Sprintf("%s/simple/%s/", h.upstreamURL, name)
cacheKey := name + "/simple"
2026-01-20 21:52:44 +00:00
body, _, err := h.proxy.FetchOrCacheMetadata(r.Context(), "pypi", cacheKey, upstreamURL, "text/html")
2026-01-20 21:52:44 +00:00
if err != nil {
if errors.Is(err, ErrUpstreamNotFound) {
http.Error(w, "not found", http.StatusNotFound)
return
}
2026-01-20 21:52:44 +00:00
h.proxy.Logger.Error("upstream request failed", "error", err)
http.Error(w, "upstream request failed", http.StatusBadGateway)
return
}
// When cooldown is enabled, fetch JSON metadata to get version timestamps
var filteredVersions map[string]bool
if h.proxy.Cooldown != nil && h.proxy.Cooldown.Enabled() {
filteredVersions = h.fetchFilteredVersions(r, name)
}
rewritten := h.rewriteSimpleHTML(body, filteredVersions)
2026-01-20 21:52:44 +00:00
w.Header().Set("Content-Type", "text/html")
w.WriteHeader(http.StatusOK)
_, _ = w.Write(rewritten)
}
// fetchFilteredVersions fetches the package's JSON metadata from upstream and
// returns the set of version strings that the cooldown policy rejects.
//
// A version is rejected when the newest upload time among its files is known
// and h.proxy.Cooldown.IsAllowed reports false for it. The lookup is best
// effort: any failure (request, non-200 status, undecodable body, missing
// "releases" object) is logged and yields nil, which callers treat as
// "filter nothing". nil is also returned when no version is rejected.
//
// h.proxy.Cooldown is dereferenced without a nil check here, so callers must
// only invoke this when a cooldown policy is configured.
func (h *PyPIHandler) fetchFilteredVersions(r *http.Request, name string) map[string]bool {
	// name comes from an unescaped path segment; escape it so it stays a
	// single segment of the upstream URL.
	jsonURL := fmt.Sprintf("%s/pypi/%s/json", h.upstreamURL, url.PathEscape(name))

	req, err := http.NewRequestWithContext(r.Context(), http.MethodGet, jsonURL, nil)
	if err != nil {
		h.proxy.Logger.Warn("cooldown: building pypi metadata request failed", "package", name, "error", err)
		return nil
	}
	req.Header.Set("Accept", "application/json")

	resp, err := h.proxy.HTTPClient.Do(req)
	if err != nil {
		h.proxy.Logger.Warn("cooldown: pypi metadata request failed", "package", name, "error", err)
		return nil
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		h.proxy.Logger.Warn("cooldown: unexpected pypi metadata status", "package", name, "status", resp.StatusCode)
		return nil
	}

	var metadata map[string]any
	if err := json.NewDecoder(resp.Body).Decode(&metadata); err != nil {
		h.proxy.Logger.Warn("cooldown: decoding pypi metadata failed", "package", name, "error", err)
		return nil
	}

	releases, ok := metadata["releases"].(map[string]any)
	if !ok {
		return nil
	}

	packagePURL := purl.MakePURLString("pypi", name, "")
	filtered := make(map[string]bool)
	for version, files := range releases {
		filesArr, ok := files.([]any)
		if !ok {
			continue
		}
		// Versions without any parseable upload time are never filtered.
		publishedAt := h.newestUploadTime(filesArr)
		if !publishedAt.IsZero() && !h.proxy.Cooldown.IsAllowed("pypi", packagePURL, publishedAt) {
			filtered[version] = true
		}
	}

	if len(filtered) == 0 {
		return nil
	}
	return filtered
}
2026-01-20 21:52:44 +00:00
// rewriteSimpleHTML rewrites package URLs in simple API HTML to point at this proxy.
// If filteredVersions is non-nil, links for those versions are removed entirely.
func (h *PyPIHandler) rewriteSimpleHTML(body []byte, filteredVersions map[string]bool) []byte {
// If cooldown filtering is active, remove entire <a> tags for filtered versions
if len(filteredVersions) > 0 {
// Match full anchor tags: <a ...href="...">filename</a>
linkRe := regexp.MustCompile(`<a[^>]+href="[^"]*"[^>]*>[^<]+</a>`)
body = linkRe.ReplaceAllFunc(body, func(match []byte) []byte {
// Extract filename from between tags
innerRe := regexp.MustCompile(`>([^<]+)</a>`)
innerMatch := innerRe.FindSubmatch(match)
if len(innerMatch) < minSubmatchParts {
return match
}
filename := string(innerMatch[1])
_, version := h.parseFilename(strings.TrimSpace(filename))
if version != "" && filteredVersions[version] {
return nil
}
return match
})
}
2026-01-20 21:52:44 +00:00
// Match href attributes pointing to packages
// PyPI URLs look like: https://files.pythonhosted.org/packages/...
re := regexp.MustCompile(`href="(https://files\.pythonhosted\.org/packages/[^"]+)"`)
return re.ReplaceAllFunc(body, func(match []byte) []byte {
submatch := re.FindSubmatch(match)
if len(submatch) < minSubmatchParts {
2026-01-20 21:52:44 +00:00
return match
}
origURL := string(submatch[1])
u, err := url.Parse(origURL)
if err != nil {
return match
}
newURL := fmt.Sprintf("%s/pypi/packages%s", h.proxyURL, u.Path)
return []byte(fmt.Sprintf(`href="%s"`, newURL))
})
}
// handleJSON serves the JSON API package metadata.
func (h *PyPIHandler) handleJSON(w http.ResponseWriter, r *http.Request) {
name := r.PathValue("name")
if name == "" {
http.Error(w, "invalid package name", http.StatusBadRequest)
return
}
h.proxy.Logger.Info("pypi json request", "package", name)
upstreamURL := fmt.Sprintf("%s/pypi/%s/json", h.upstreamURL, name)
h.proxyAndRewriteJSON(w, r, upstreamURL, name+"/json")
2026-01-20 21:52:44 +00:00
}
// handleVersionJSON serves the JSON API version metadata.
func (h *PyPIHandler) handleVersionJSON(w http.ResponseWriter, r *http.Request) {
name := r.PathValue("name")
version := r.PathValue("version")
if name == "" || version == "" {
http.Error(w, "invalid request", http.StatusBadRequest)
return
}
h.proxy.Logger.Info("pypi version json request", "package", name, "version", version)
upstreamURL := fmt.Sprintf("%s/pypi/%s/%s/json", h.upstreamURL, name, version)
h.proxyAndRewriteJSON(w, r, upstreamURL, name+"/"+version)
2026-01-20 21:52:44 +00:00
}
// proxyAndRewriteJSON fetches JSON metadata and rewrites download URLs.
func (h *PyPIHandler) proxyAndRewriteJSON(w http.ResponseWriter, r *http.Request, upstreamURL, cacheKey string) {
body, _, err := h.proxy.FetchOrCacheMetadata(r.Context(), "pypi", cacheKey, upstreamURL)
2026-01-20 21:52:44 +00:00
if err != nil {
if errors.Is(err, ErrUpstreamNotFound) {
http.Error(w, "not found", http.StatusNotFound)
return
}
2026-01-20 21:52:44 +00:00
h.proxy.Logger.Error("upstream request failed", "error", err)
http.Error(w, "upstream request failed", http.StatusBadGateway)
return
}
rewritten, err := h.rewriteJSONMetadata(body)
if err != nil {
h.proxy.Logger.Warn("failed to rewrite metadata, proxying original", "error", err)
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write(body)
return
}
w.Header().Set("Content-Type", "application/json")
_, _ = w.Write(rewritten)
}
// rewriteJSONMetadata rewrites download URLs in PyPI JSON metadata.
// If cooldown is enabled, versions published too recently are filtered out.
2026-01-20 21:52:44 +00:00
func (h *PyPIHandler) rewriteJSONMetadata(body []byte) ([]byte, error) {
var metadata map[string]any
if err := json.Unmarshal(body, &metadata); err != nil {
return nil, err
}
packageName, _ := extractPyPIName(metadata)
packagePURL := ""
if packageName != "" {
packagePURL = purl.MakePURLString("pypi", packageName, "")
2026-01-20 21:52:44 +00:00
}
h.filterAndRewriteReleases(metadata, packageName, packagePURL)
h.filterAndRewriteURLs(metadata, packagePURL)
return json.Marshal(metadata)
}
// filterAndRewriteReleases applies cooldown filtering and URL rewriting to the
// releases map in PyPI metadata.
func (h *PyPIHandler) filterAndRewriteReleases(metadata map[string]any, packageName, packagePURL string) {
releases, ok := metadata["releases"].(map[string]any)
if !ok {
return
}
for version, files := range releases {
if h.shouldFilterRelease(packagePURL, files) {
h.proxy.Logger.Info("cooldown: filtering pypi version",
"package", packageName, "version", version)
delete(releases, version)
continue
2026-01-20 21:52:44 +00:00
}
h.rewriteFileEntries(files)
2026-01-20 21:52:44 +00:00
}
}
2026-01-20 21:52:44 +00:00
// shouldFilterRelease reports whether a release must be hidden because of the
// cooldown policy.
//
// It returns false when no cooldown is configured or enabled, when
// packagePURL is empty, when files is not a list, or when no upload time can
// be determined. Otherwise the newest upload time among files is checked
// against the policy.
func (h *PyPIHandler) shouldFilterRelease(packagePURL string, files any) bool {
	cooldown := h.proxy.Cooldown
	if cooldown == nil {
		return false
	}
	if !cooldown.Enabled() {
		return false
	}
	if packagePURL == "" {
		return false
	}

	entries, isList := files.([]any)
	if !isList {
		return false
	}

	newest := h.newestUploadTime(entries)
	if newest.IsZero() {
		return false
	}
	return !cooldown.IsAllowed("pypi", packagePURL, newest)
}
// rewriteFileEntries rewrites the download URL of every file entry in files.
// Values that are not a list are ignored, as are list elements that are not
// JSON objects.
func (h *PyPIHandler) rewriteFileEntries(files any) {
	// A failed assertion leaves entries nil, so the loop below is skipped.
	entries, _ := files.([]any)
	for i := range entries {
		entry, isObject := entries[i].(map[string]any)
		if !isObject {
			continue
		}
		h.rewriteURLEntry(entry)
	}
}
// filterAndRewriteURLs applies cooldown filtering and URL rewriting to the
// urls array (current version files) in PyPI metadata.
func (h *PyPIHandler) filterAndRewriteURLs(metadata map[string]any, packagePURL string) {
urls, ok := metadata["urls"].([]any)
if !ok {
return
}
if h.shouldFilterRelease(packagePURL, urls) {
metadata["urls"] = []any{}
}
if urls, ok := metadata["urls"].([]any); ok {
for _, u := range urls {
if umap, ok := u.(map[string]any); ok {
h.rewriteURLEntry(umap)
}
}
}
2026-01-20 21:52:44 +00:00
}
// extractPyPIName reads the package name from info.name in PyPI JSON
// metadata. The boolean is false (and the name empty) when "info" is not an
// object or "name" is not a string.
func extractPyPIName(metadata map[string]any) (string, bool) {
	if info, isObject := metadata["info"].(map[string]any); isObject {
		name, isString := info["name"].(string)
		return name, isString
	}
	return "", false
}
// newestUploadTime returns the latest upload_time_iso_8601 value found in a
// list of file entries. Entries that are not objects, lack the field, or
// carry a value that does not parse as RFC 3339 are skipped. The zero
// time.Time is returned when no entry yields a timestamp.
func (h *PyPIHandler) newestUploadTime(files []any) time.Time {
	var latest time.Time
	for _, raw := range files {
		// Indexing a nil map is safe, so a failed assertion simply results in
		// the field lookup below failing as well.
		entry, _ := raw.(map[string]any)
		stamp, isString := entry["upload_time_iso_8601"].(string)
		if !isString {
			continue
		}
		if parsed, err := time.Parse(time.RFC3339, stamp); err == nil && parsed.After(latest) {
			latest = parsed
		}
	}
	return latest
}
2026-01-20 21:52:44 +00:00
// rewriteURLEntry points a single metadata file entry at this proxy by
// replacing its "url" value in place. Only URLs whose host is exactly
// files.pythonhosted.org are rewritten; entries without a string "url", with
// an unparseable URL, or with any other host are left untouched.
func (h *PyPIHandler) rewriteURLEntry(entry map[string]any) {
	raw, isString := entry["url"].(string)
	if !isString {
		return
	}

	parsed, err := url.Parse(raw)
	if err != nil || parsed.Host != "files.pythonhosted.org" {
		return
	}

	entry["url"] = fmt.Sprintf("%s/pypi/packages%s", h.proxyURL, parsed.Path)
}
// handleDownload serves a package file, fetching and caching from upstream if needed.
func (h *PyPIHandler) handleDownload(w http.ResponseWriter, r *http.Request) {
path := r.PathValue("path")
if path == "" {
http.Error(w, "invalid path", http.StatusBadRequest)
return
}
// Path format: /packages/{hash_prefix}/{hash}/{filename}
// e.g., /packages/ab/cd/abc123.../requests-2.31.0.tar.gz
parts := strings.Split(path, "/")
if len(parts) < minPyPIPathParts {
2026-01-20 21:52:44 +00:00
http.Error(w, "invalid path", http.StatusBadRequest)
return
}
filename := parts[len(parts)-1]
name, version := h.parseFilename(filename)
if name == "" {
// Can't determine name/version, use hash as identifier
name = fmt.Sprintf("_hash_%s", hashPath(path))
version = "0"
}
h.proxy.Logger.Info("pypi download request",
"name", name, "version", version, "filename", filename)
// Construct upstream URL; the incoming path starts with
// '/packages' so there is no need to include it in the format
// string
upstreamURL := fmt.Sprintf("https://files.pythonhosted.org/%s", path)
2026-01-20 21:52:44 +00:00
result, err := h.proxy.GetOrFetchArtifactFromURL(r.Context(), "pypi", name, version, filename, upstreamURL)
if err != nil {
h.proxy.Logger.Error("failed to get artifact", "error", err)
http.Error(w, "failed to fetch package", http.StatusBadGateway)
return
}
ServeArtifact(w, result)
}
// parseFilename extracts package name and version from a PyPI filename.
// Handles both wheels and sdists:
// - requests-2.31.0-py3-none-any.whl
// - requests-2.31.0.tar.gz
func (h *PyPIHandler) parseFilename(filename string) (name, version string) {
// Try wheel format first: {name}-{version}(-{build})?-{python}-{abi}-{platform}.whl
if strings.HasSuffix(filename, ".whl") {
base := strings.TrimSuffix(filename, ".whl")
parts := strings.Split(base, "-")
if len(parts) >= minWheelParts {
2026-01-20 21:52:44 +00:00
// Find where version ends (version followed by python tag)
for i := 1; i < len(parts)-2; i++ {
// Check if this looks like a python tag (py2, py3, cp39, etc)
if isPythonTag(parts[i]) {
name = strings.Join(parts[:i-1], "-")
version = parts[i-1]
return
}
}
}
}
// Try sdist formats: {name}-{version}.tar.gz, {name}-{version}.zip
for _, ext := range []string{".tar.gz", ".tar.bz2", ".zip", ".tar"} {
if strings.HasSuffix(filename, ext) {
base := strings.TrimSuffix(filename, ext)
// Find last hyphen followed by version
for i := len(base) - 1; i >= 0; i-- {
if base[i] == '-' && i+1 < len(base) && isVersionStart(base[i+1]) {
return base[:i], base[i+1:]
}
}
}
}
return "", ""
}
func isPythonTag(s string) bool {
if len(s) < minPythonTagLen {
2026-01-20 21:52:44 +00:00
return false
}
// Python tags start with py, cp, pp, ip, jy
prefixes := []string{"py", "cp", "pp", "ip", "jy"}
for _, p := range prefixes {
if strings.HasPrefix(s, p) {
return true
}
}
return false
}
// isVersionStart reports whether c is an ASCII digit, i.e. whether it can be
// the first byte of a version string in a distribution filename.
func isVersionStart(c byte) bool {
	if c < '0' {
		return false
	}
	return c <= '9'
}
// hashPath returns a short identifier for path: the first 8 bytes of its
// SHA-256 digest, hex encoded (16 characters).
func hashPath(path string) string {
	const prefixLen = 8
	digest := sha256.Sum256([]byte(path))
	return hex.EncodeToString(digest[:prefixLen])
}
// proxySimple proxies a simple API request.
func (h *PyPIHandler) proxySimple(w http.ResponseWriter, r *http.Request, path string) {
upstreamURL := h.upstreamURL + path
req, err := http.NewRequestWithContext(r.Context(), http.MethodGet, upstreamURL, nil)
if err != nil {
http.Error(w, "failed to create request", http.StatusInternalServerError)
return
}
req.Header.Set("Accept", "text/html")
resp, err := h.proxy.HTTPClient.Do(req)
2026-01-20 21:52:44 +00:00
if err != nil {
h.proxy.Logger.Error("upstream request failed", "error", err)
http.Error(w, "upstream request failed", http.StatusBadGateway)
return
}
defer func() { _ = resp.Body.Close() }()
for k, vv := range resp.Header {
for _, v := range vv {
w.Header().Add(k, v)
}
}
w.WriteHeader(resp.StatusCode)
_, _ = io.Copy(w, resp.Body)
}