Nginx caching-fallback proxy

This tutorial shows how to set up nginx as an advanced proxy, which does the following:

I utilized this setup on my Debian packages proxy. It permanently stores every repository file that has been fetched at least once, but always queries the latest Release files. It even works offline, although you'll only get the last known Release file and those packages which were downloaded so far. Excellent for bulk-installing similar Debian workstations.

Unlike nginx's proxy_cache facility, which stores cached resources in a hashed dir structure with hashed filenames, proxy_store can save individual web resources into a directory tree resembling their URL paths. It's useful if you plan to access these cached files directly on the filesystem, not only via the web.

This config provides a web site with http://CACHE-DOMAIN/UPSTREAM/PATH urls, where CACHE-DOMAIN is your caching webserver and UPSTREAM being the "real" debian repos which you forward proxy to, and PATH is any path under the UPSTREAM domains.

Once this is in place, you can configure apt sources like this:

deb http://debcache.example.net/archive.debian.org/debian/ wheezy main contrib

Now let's see the config file itself, details in comments:

server
{
    # usual options: listen, server_name, access_log, error_log, ...

    # where to store cached files

        root /srv/web/debcache;

    # why not show dir index

        autoindex on;

    # refuse any request except those which are explicitly handled below.
    # 403 Forbidden is the appropriate status for a refusal; a 3xx code
    # would signal a redirect to clients.

        location /
        {
            return 403 "Not available on this proxy\n";
        }

    # extract which upstream domain the user wants to access

        set $backend_host "";
        set $backend_path "";

        if ($request_uri ~ ^/([^/]+)/(.*) )
        {
            set $backend_host $1;
            set $backend_path_uri $2;
            # set_unescape_uri comes from the set-misc (OpenResty) module:
            # it decodes %XX escapes so the path is proxied upstream correctly
            set_unescape_uri $backend_path $backend_path_uri;
        }

    # this defines which public repos you are gonna forward proxy to.
    # do not allow arbitrary websites here because malevolent agents could use you as an open gateway proxy.
    # UPDATE THIS PATTERN AT THE proxy_redirect DECLARATIONS TOO!

        if ($request_uri !~ ^/((ftp(\.(us|de|at|hu))?|archive|packages|deb|security)\.(debian|devuan)\.org/|debian\.ipacct\.com/|mirrors\.dotsrc\.org/(debian|devuan)) )
        {
            return 403 "this host is not cached: $backend_host\n";
        }

    # the following locations define which URLs are served from local cache and
    # which ones are searched on source first and then fallback to local.

    # files under the /dists/.../by-hash/ dirs are content indexed, don't change,
    # serve from local cache, and fetch from the network only if it is not cached

        location ~ /by-hash/
        {
            try_files $uri $uri/default.html @fetch;
            include debcache-local;
        }

    # other files under /dists are generally updated,
    # try the online version first, then fallback to local

        location ~ /dists/
        {
            include debcache-fetch;
            proxy_intercept_errors on;
            error_page 404 500 502 503 504 = @localfallback;
        }

    # other files outside of /dists are also good to serve locally

        location ~ ^/[^/]+/.+
        {
            try_files $uri $uri/default.html @fetch;
            include debcache-local;
        }

    # this block defines what to do when the online version is not available

        location @localfallback
        {
            try_files $uri $uri/default.html =500;
            default_type text/html;

            # it's always a good idea to give some cache info and what the upstream responded

            add_header X-Cache-Status "STALE from $hostname" always;
            add_header X-Upstream-Status "$upstream_status" always;
        }

    # and this fetches the online version from the upstream

        location @fetch
        {
            include debcache-fetch;
        }
}

Now there are two config snippets which are referenced (included) multiple times: debcache-local and debcache-fetch:

# debcache-local:
# included by locations that serve a file straight from the local store (cache HIT)

    # serve stored "directory" pages (default.html) as HTML
    default_type text/html;
    # disclose that the response came from the local cache
    add_header X-Cache-Status "HIT from $hostname" always;

    # these options are not strictly required.
    # their purpose is to rewrite (some/most?) links in the text/html content to refer to the proxy host, 
    # not to the upstream host, so you'll get (mostly) internal links when browsing the debian repo's 
    # web site via the proxy.

        # prefix root-relative HREF/SRC/ACTION attribute values ("/path") with
        # "/$backend_host"; the (?!/) lookahead skips protocol-relative "//host/..."
        # URLs. requires the ngx_http_substitutions_filter (subs_filter) module.
        subs_filter '(^| )(HREF|SRC|ACTION)=(["\x27])/(?!/)' '$1$2=$3/$backend_host/' igr;
        # same rewrite for the redirect target inside <META HTTP-EQUIV="Refresh"> tags
        subs_filter '(<META[^>]*HTTP-EQUIV=["\x27]Refresh["\x27][^>]*CONTENT=["\x27][0-9]+;\s*(URL=)?)/(?!/)' '$1/$backend_host/' igr;


# debcache-fetch:
# included by locations that forward the request to the real (upstream) repo
# and store the response in the local cache on the way back

    # proxy options follow here. `proxy_pass` is the main one, it forwards the client's request to the
    # upstream site. `$backend_host/$backend_path` is got from the URL path in the original (frontend) request.

        proxy_pass http://$backend_host/$backend_path;

    # the following options manipulate the request and response headers in order not to confuse 
    # the client talking to our proxy.
    # removing the Accept-Encoding request header is needed to get back non-compressed HTTP payload,
    # so we can rewrite the HTML content (see `subs_filter` options).

        # present the real upstream hostname, not our own, to the backend
        proxy_set_header Host "$backend_host";
        proxy_set_header Referer "";
        proxy_set_header Accept-Encoding "";
        # strip upstream response headers that only make sense end-to-end
        # (cookies, caching variance, security policies tied to the upstream origin)
        proxy_hide_header Set-Cookie;
        proxy_hide_header Vary;
        proxy_hide_header Content-Security-Policy;
        proxy_hide_header Expect-CT;
        proxy_hide_header Strict-Transport-Security;
        proxy_hide_header Upgrade;
        proxy_hide_header X-Request-Id;

    # good idea to disclose cache and upstream info in the response headers

        add_header X-Cache-Status "MISS from $hostname" always;
        add_header X-Upstream-Status "$upstream_status" always;

    # these `proxy_redirect` options are similar to the `subs_filter` options above as they also
    # rewrite URL references from the upstream host to ours, but these are less optional, while
    # URL references (links) in HTML are processed mostly only by user-interactive apps, HTTP redirects
    # (Location header) are followed by package management apps (`apt-get`) too.
    # we want to keep the whole package download process on our proxy, so rewrite known redirects
    # and fail others (see `0.0.0.0` target). so if you meet an apt-get trying to access `http://0.0.0.0/...`,
    # you may add the indicated domain to the allow list.
    # NOTE: these are matched in the order declared, so the catch-all
    # `0.0.0.0` rule must stay last.

        # absolute redirect back to the same upstream host -> keep it on our proxy
        proxy_redirect ~*^http://$backend_host(:\d+)?(/.*) /$backend_host$2;
        # relative redirect -> prefix with the upstream host path component
        proxy_redirect ~*^(/.*) /$backend_host$1;
        # DON'T FORGET TO UPDATE THIS PATTERN IN THE SERVER BLOCK TOO!
        # absolute redirect to another allowed mirror -> route it through us as well
        proxy_redirect ~*^http://((ftp(\.(us|de|at|hu))?|archive|packages|deb|security)\.(debian|devuan)\.org/|debian\.ipacct\.com/|mirrors\.dotsrc\.org/(debian|devuan))(?<rest>.*) /$1$rest;
        # anything else: rewrite to an unreachable address so the client fails loudly
        proxy_redirect ~(.*) http://0.0.0.0/UNSUPPORTED-REDIRECT/$1;

    # content rewrite like above

        subs_filter '(^| )(HREF|SRC|ACTION)=(["\x27])/(?!/)' '$1$2=$3/$backend_host/' igr;
        subs_filter '(<META[^>]*HTTP-EQUIV=["\x27]Refresh["\x27][^>]*CONTENT=["\x27][0-9]+;\s*(URL=)?)/(?!/)' '$1/$backend_host/' igr;

    # this part saves the fetched file to the local cache, mirroring the URL
    # path under the document root.
    # if it was a "directory" (ie. URL ends with "/") then save to "default.html" in that directory.

        set $save_path $document_root$uri;
        if ($uri ~ /$)
        {
            set $save_path $document_root${uri}default.html;
        }
        proxy_store $save_path;
        # world-readable so the cache tree can also be served/browsed directly
        proxy_store_access user:rw group:rw all:r;

Enjoy.