This tutorial shows how to set up nginx as an advanced caching proxy, which does the following:
I use this setup as my Debian package proxy. It stores every repository file once it has been fetched, but always queries the latest Release files. It even works offline: you get the last known Release file and whichever packages have been downloaded so far. Excellent for bulk-installing similar Debian workstations.
Unlike nginx's proxy_cache
facility, which stores cached resources in
a hashed directory structure with hashed filenames, proxy_store
can save
individual web resources into a directory tree resembling their URL path.
This is useful if you plan to access the cached files directly on the
filesystem, not only via the web.
This config provides a web site with http://CACHE-DOMAIN/UPSTREAM/PATH
URLs, where CACHE-DOMAIN
is your caching webserver, UPSTREAM
is
the "real" Debian repo which you forward proxy to, and PATH
is any path
under the UPSTREAM
domain.
In the end you can configure similar apt sources:
deb http://debcache.example.net/archive.debian.org/debian/ wheezy main contrib
Now let's see the config file itself, details in comments:
server
{
	# usual options: listen, server_name, access_log, error_log, ...

	# where to store cached files (proxy_store saves fetched files under this root)
	root /srv/web/debcache;
	# why not show dir index
	autoindex on;

	# refuse any request except those which are explicitly handled below.
	# HTTP 403 (Forbidden) is the appropriate status for a refused request;
	# do not use a 3xx code here, those signal redirection, not refusal.
	location /
	{
		return 403 "Not available on this proxy\n";
	}

	# extract which upstream domain the user wants to access.
	# NOTE: set_unescape_uri is provided by the set-misc-nginx-module
	# (ngx_set_misc), not by stock nginx — make sure it is compiled in.
	set $backend_host "";
	set $backend_path "";
	if ($request_uri ~ ^/([^/]+)/(.*) )
	{
		set $backend_host $1;
		set $backend_path_uri $2;
		set_unescape_uri $backend_path $backend_path_uri;
	}

	# this defines which public repos you are gonna forward proxy to.
	# do not allow every website here, because malevolent agents could use you as an open gateway proxy.
	# UPDATE THIS PATTERN AT THE proxy_redirect DECLARATIONS TOO!
	if ($request_uri !~ ^/((ftp(\.(us|de|at|hu))?|archive|packages|deb|security)\.(debian|devuan)\.org/|debian\.ipacct\.com/|mirrors\.dotsrc\.org/(debian|devuan)) )
	{
		return 403 "this host is not cached: $backend_host\n";
	}

	# the following locations define which URLs are served from the local cache and
	# which ones are searched on the source first with a fallback to local.

	# files under the /dists/.../by-hash/ dirs are content-indexed, so they never change:
	# serve from local cache, and fetch from the network only if not cached yet
	location ~ /by-hash/
	{
		try_files $uri $uri/default.html @fetch;
		include debcache-local;
	}

	# other files under /dists are generally updated:
	# try the online version first, then fall back to local
	location ~ /dists/
	{
		include debcache-fetch;
		proxy_intercept_errors on;
		error_page 404 500 502 503 504 = @localfallback;
	}

	# other files outside of /dists are also good to serve locally
	location ~ ^/[^/]+/.+
	{
		try_files $uri $uri/default.html @fetch;
		include debcache-local;
	}

	# this block defines what to do when the online version is not available
	location @localfallback
	{
		try_files $uri $uri/default.html =500;
		default_type text/html;
		# it's always a good idea to disclose cache info and what the upstream responded
		add_header X-Cache-Status "STALE from $hostname" always;
		add_header X-Upstream-Status "$upstream_status" always;
	}

	# and this fetches the online version from the upstream
	location @fetch
	{
		include debcache-fetch;
	}
}
Now there are 2 config parts which are referenced multiple times: debcache-local and debcache-fetch:
# debcache-local:
# directives applied when a request is served from the on-disk cache.
default_type text/html;
# disclose that this response came from the local cache, even on error responses
add_header X-Cache-Status "HIT from $hostname" always;
# these options are not strictly required.
# their purpose is to rewrite (some/most?) links in the text/html content to refer to the proxy host,
# not to the upstream host, so you'll get (mostly) internal links when browsing the debian repo's
# web site via the proxy.
# NOTE: subs_filter is provided by the ngx_http_substitutions_filter_module,
# not by stock nginx — make sure it is compiled in.
# first rule rewrites root-relative HREF/SRC/ACTION attributes (but not protocol-relative "//..." ones);
# second rule rewrites the URL inside <META HTTP-EQUIV="Refresh" ...> tags the same way.
subs_filter '(^| )(HREF|SRC|ACTION)=(["\x27])/(?!/)' '$1$2=$3/$backend_host/' igr;
subs_filter '(<META[^>]*HTTP-EQUIV=["\x27]Refresh["\x27][^>]*CONTENT=["\x27][0-9]+;\s*(URL=)?)/(?!/)' '$1/$backend_host/' igr;
# debcache-fetch:
# directives applied when the request is forwarded to the upstream repo.
# proxy options follow here. `proxy_pass` is the main one, it forwards the client's request to the
# upstream site. `$backend_host/$backend_path` is got from the URL path in the original (frontend) request.
# NOTE(review): since proxy_pass contains variables, nginx resolves $backend_host at
# request time — this requires a `resolver` directive to be configured; verify your setup.
proxy_pass http://$backend_host/$backend_path;
# the following options manipulate the request and response headers in order not to confuse
# the client talking to our proxy.
# removing the Accept-Encoding request header is needed to get back non-compressed HTTP payload,
# so we can rewrite the HTML content (see `subs_filter` options).
proxy_set_header Host "$backend_host";
proxy_set_header Referer "";
proxy_set_header Accept-Encoding "";
# hide upstream headers that are either irrelevant behind the proxy or would
# confuse clients (cookies, caching variance, security policies tied to the upstream host).
proxy_hide_header Set-Cookie;
proxy_hide_header Vary;
proxy_hide_header Content-Security-Policy;
proxy_hide_header Expect-CT;
proxy_hide_header Strict-Transport-Security;
proxy_hide_header Upgrade;
proxy_hide_header X-Request-Id;
# good idea to disclose cache and upstream info in the response headers
add_header X-Cache-Status "MISS from $hostname" always;
add_header X-Upstream-Status "$upstream_status" always;
# these `proxy_redirect` options are similar to the `subs_filter` options above as they also
# rewrite URL references from the upstream host to ours, but these are less optional, while
# URL references (links) in HTML are processed mostly only by user-interactive apps, HTTP redirects
# (Location header) are followed by package management apps (`apt-get`) too.
# we want to keep the whole package download process on our proxy, so rewrite known redirects
# and fail others (see `0.0.0.0` target). so if you meet an apt-get trying to access `http://0.0.0.0/...`,
# you may add the indicated domain to the allow list.
# NOTE(review): the order of these rules matters — same-host and relative redirects are
# rewritten first, then redirects to other allow-listed hosts, and anything left falls
# through to the unsupported-redirect catch-all.
proxy_redirect ~*^http://$backend_host(:\d+)?(/.*) /$backend_host$2;
proxy_redirect ~*^(/.*) /$backend_host$1;
# DON'T FORGET TO UPDATE THIS PATTERN IN THE SERVER BLOCK TOO!
proxy_redirect ~*^http://((ftp(\.(us|de|at|hu))?|archive|packages|deb|security)\.(debian|devuan)\.org/|debian\.ipacct\.com/|mirrors\.dotsrc\.org/(debian|devuan))(?<rest>.*) /$1$rest;
proxy_redirect ~(.*) http://0.0.0.0/UNSUPPORTED-REDIRECT/$1;
# content rewrite like above (subs_filter needs the ngx_http_substitutions_filter_module)
subs_filter '(^| )(HREF|SRC|ACTION)=(["\x27])/(?!/)' '$1$2=$3/$backend_host/' igr;
subs_filter '(<META[^>]*HTTP-EQUIV=["\x27]Refresh["\x27][^>]*CONTENT=["\x27][0-9]+;\s*(URL=)?)/(?!/)' '$1/$backend_host/' igr;
# this part saves the fetched file to the local cache
# if it was a "directory" (ie. URL ends with "/") then save to "default.html" in that directory,
# so the location blocks above can serve it later via the `$uri/default.html` try_files entry.
set $save_path $document_root$uri;
if ($uri ~ /$)
{
set $save_path $document_root${uri}default.html;
}
proxy_store $save_path;
# cached files are group-writable so a cleanup job can purge them; world-readable for serving.
proxy_store_access user:rw group:rw all:r;
Enjoy.