Compare commits

..

14 commits

Author SHA1 Message Date
Quentin Dufour
85aca61860
[SKIP CI] WIP merkle todo encapsulation 2025-05-02 08:18:46 +02:00
Quentin Dufour
46ebfdba66
set config 2025-05-01 16:30:09 +02:00
Quentin Dufour
ee8fa687ad
add permits to merkle worker 2025-05-01 13:53:38 +02:00
Quentin Dufour
fa457328c8
[SKIP CI] add forget semaphore, add them back not yet implemented 2025-05-01 10:05:04 +02:00
Quentin Dufour
f34558af07
add a rpc in-flight limiter 2025-05-01 08:56:57 +02:00
Quentin Dufour
d78e5f8a1b
fix logic 2025-04-30 15:53:12 +02:00
Quentin Dufour
3172f875ae
pass config 2025-04-30 14:41:16 +02:00
Quentin Dufour
11a6417d11
try to better track queue len evol 2025-04-30 09:01:50 +02:00
Quentin Dufour
b0a9e007bd
try another approach to backpressure 2025-04-30 08:49:44 +02:00
Quentin Dufour
904548d1d1
allow up to 30sec 2025-04-30 08:38:28 +02:00
Quentin Dufour
6cc79bc696
react slower 2025-04-30 08:37:35 +02:00
Quentin Dufour
60b3d28f93
add an opentelemetry metric 2025-04-30 07:24:20 +02:00
Quentin Dufour
7fddf0af9c
first implementation 2025-04-29 10:50:47 +02:00
Quentin Dufour
78882f4040
add a backpressure system 2025-04-29 09:38:47 +02:00
109 changed files with 1824 additions and 3343 deletions

View file

@ -1,6 +1,3 @@
labels:
nix: "enabled"
when: when:
event: event:
- push - push
@ -12,32 +9,27 @@ when:
steps: steps:
- name: check formatting - name: check formatting
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build -j4 --attr flakePackages.fmt - nix-shell --attr devShell --run "cargo fmt -- --check"
- name: build - name: build
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build -j4 --attr flakePackages.dev - nix-build -j4 --attr flakePackages.dev
- name: unit + func tests (lmdb) - name: unit + func tests (lmdb)
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build -j4 --attr flakePackages.tests-lmdb - nix-build -j4 --attr flakePackages.tests-lmdb
- name: unit + func tests (sqlite) - name: unit + func tests (sqlite)
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build -j4 --attr flakePackages.tests-sqlite - nix-build -j4 --attr flakePackages.tests-sqlite
- name: unit + func tests (fjall)
image: nixpkgs/nix:nixos-24.05
commands:
- nix-build -j4 --attr flakePackages.tests-fjall
- name: integration tests - name: integration tests
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build -j4 --attr flakePackages.dev - nix-build -j4 --attr flakePackages.dev
- nix-shell --attr ci --run ./script/test-smoke.sh || (cat /tmp/garage.log; false) - nix-shell --attr ci --run ./script/test-smoke.sh || (cat /tmp/garage.log; false)

View file

@ -1,6 +1,3 @@
labels:
nix: "enabled"
when: when:
event: event:
- deployment - deployment
@ -11,7 +8,7 @@ depends_on:
steps: steps:
- name: refresh-index - name: refresh-index
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
environment: environment:
AWS_ACCESS_KEY_ID: AWS_ACCESS_KEY_ID:
from_secret: garagehq_aws_access_key_id from_secret: garagehq_aws_access_key_id
@ -22,7 +19,7 @@ steps:
- nix-shell --attr ci --run "refresh_index" - nix-shell --attr ci --run "refresh_index"
- name: multiarch-docker - name: multiarch-docker
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
environment: environment:
DOCKER_AUTH: DOCKER_AUTH:
from_secret: docker_auth from_secret: docker_auth

View file

@ -1,6 +1,3 @@
labels:
nix: "enabled"
when: when:
event: event:
- deployment - deployment
@ -19,17 +16,17 @@ matrix:
steps: steps:
- name: build - name: build
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-build --attr releasePackages.${ARCH} --argstr git_version ${CI_COMMIT_TAG:-$CI_COMMIT_SHA} - nix-build --attr releasePackages.${ARCH} --argstr git_version ${CI_COMMIT_TAG:-$CI_COMMIT_SHA}
- name: check is static binary - name: check is static binary
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-shell --attr ci --run "./script/not-dynamic.sh result/bin/garage" - nix-shell --attr ci --run "./script/not-dynamic.sh result/bin/garage"
- name: integration tests - name: integration tests
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-shell --attr ci --run ./script/test-smoke.sh || (cat /tmp/garage.log; false) - nix-shell --attr ci --run ./script/test-smoke.sh || (cat /tmp/garage.log; false)
when: when:
@ -39,7 +36,7 @@ steps:
ARCH: i386 ARCH: i386
- name: upgrade tests - name: upgrade tests
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
commands: commands:
- nix-shell --attr ci --run "./script/test-upgrade.sh v0.8.4 x86_64-unknown-linux-musl" || (cat /tmp/garage.log; false) - nix-shell --attr ci --run "./script/test-upgrade.sh v0.8.4 x86_64-unknown-linux-musl" || (cat /tmp/garage.log; false)
when: when:
@ -47,7 +44,7 @@ steps:
ARCH: amd64 ARCH: amd64
- name: push static binary - name: push static binary
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
environment: environment:
TARGET: "${TARGET}" TARGET: "${TARGET}"
AWS_ACCESS_KEY_ID: AWS_ACCESS_KEY_ID:
@ -58,7 +55,7 @@ steps:
- nix-shell --attr ci --run "to_s3" - nix-shell --attr ci --run "to_s3"
- name: docker build and publish - name: docker build and publish
image: nixpkgs/nix:nixos-24.05 image: nixpkgs/nix:nixos-22.05
environment: environment:
DOCKER_PLATFORM: "linux/${ARCH}" DOCKER_PLATFORM: "linux/${ARCH}"
CONTAINER_NAME: "dxflrs/${ARCH}_garage" CONTAINER_NAME: "dxflrs/${ARCH}_garage"

1966
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -24,18 +24,18 @@ default-members = ["src/garage"]
# Internal Garage crates # Internal Garage crates
format_table = { version = "0.1.1", path = "src/format-table" } format_table = { version = "0.1.1", path = "src/format-table" }
garage_api_common = { version = "1.3.1", path = "src/api/common" } garage_api_common = { version = "1.1.0", path = "src/api/common" }
garage_api_admin = { version = "1.3.1", path = "src/api/admin" } garage_api_admin = { version = "1.1.0", path = "src/api/admin" }
garage_api_s3 = { version = "1.3.1", path = "src/api/s3" } garage_api_s3 = { version = "1.1.0", path = "src/api/s3" }
garage_api_k2v = { version = "1.3.1", path = "src/api/k2v" } garage_api_k2v = { version = "1.1.0", path = "src/api/k2v" }
garage_block = { version = "1.3.1", path = "src/block" } garage_block = { version = "1.1.0", path = "src/block" }
garage_db = { version = "1.3.1", path = "src/db", default-features = false } garage_db = { version = "1.1.0", path = "src/db", default-features = false }
garage_model = { version = "1.3.1", path = "src/model", default-features = false } garage_model = { version = "1.1.0", path = "src/model", default-features = false }
garage_net = { version = "1.3.1", path = "src/net" } garage_net = { version = "1.1.0", path = "src/net" }
garage_rpc = { version = "1.3.1", path = "src/rpc" } garage_rpc = { version = "1.1.0", path = "src/rpc" }
garage_table = { version = "1.3.1", path = "src/table" } garage_table = { version = "1.1.0", path = "src/table" }
garage_util = { version = "1.3.1", path = "src/util" } garage_util = { version = "1.1.0", path = "src/util" }
garage_web = { version = "1.3.1", path = "src/web" } garage_web = { version = "1.1.0", path = "src/web" }
k2v-client = { version = "0.0.4", path = "src/k2v-client" } k2v-client = { version = "0.0.4", path = "src/k2v-client" }
# External crates from crates.io # External crates from crates.io
@ -52,11 +52,13 @@ chrono = "0.4"
crc32fast = "1.4" crc32fast = "1.4"
crc32c = "0.6" crc32c = "0.6"
crypto-common = "0.1" crypto-common = "0.1"
err-derive = "0.3"
gethostname = "0.4" gethostname = "0.4"
git-version = "0.3.4" git-version = "0.3.4"
hex = "0.4" hex = "0.4"
hexdump = "0.1" hexdump = "0.1"
hmac = "0.12" hmac = "0.12"
idna = "0.5"
itertools = "0.12" itertools = "0.12"
ipnet = "2.9.0" ipnet = "2.9.0"
lazy_static = "1.4" lazy_static = "1.4"
@ -64,7 +66,6 @@ md-5 = "0.10"
mktemp = "0.5" mktemp = "0.5"
nix = { version = "0.29", default-features = false, features = ["fs"] } nix = { version = "0.29", default-features = false, features = ["fs"] }
nom = "7.1" nom = "7.1"
parking_lot = "0.12"
parse_duration = "2.1" parse_duration = "2.1"
pin-project = "1.0.12" pin-project = "1.0.12"
pnet_datalink = "0.34" pnet_datalink = "0.34"
@ -83,14 +84,12 @@ pretty_env_logger = "0.5"
structopt = { version = "0.3", default-features = false } structopt = { version = "0.3", default-features = false }
syslog-tracing = "0.3" syslog-tracing = "0.3"
tracing = "0.1" tracing = "0.1"
tracing-journald = "0.3.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] } tracing-subscriber = { version = "0.3", features = ["env-filter"] }
heed = { version = "0.11", default-features = false, features = ["lmdb"] } heed = { version = "0.11", default-features = false, features = ["lmdb"] }
rusqlite = "0.37" rusqlite = "0.31.0"
r2d2 = "0.8" r2d2 = "0.8"
r2d2_sqlite = "0.31" r2d2_sqlite = "0.24"
fjall = "2.4"
async-compression = { version = "0.4", features = ["tokio", "zstd"] } async-compression = { version = "0.4", features = ["tokio", "zstd"] }
zstd = { version = "0.13", default-features = false } zstd = { version = "0.13", default-features = false }
@ -136,7 +135,7 @@ prometheus = "0.13"
aws-sigv4 = { version = "1.1", default-features = false } aws-sigv4 = { version = "1.1", default-features = false }
hyper-rustls = { version = "0.26", default-features = false, features = ["http1", "http2", "ring", "rustls-native-certs"] } hyper-rustls = { version = "0.26", default-features = false, features = ["http1", "http2", "ring", "rustls-native-certs"] }
log = "0.4" log = "0.4"
thiserror = "2.0" thiserror = "1.0"
# ---- used only as build / dev dependencies ---- # ---- used only as build / dev dependencies ----
assert-json-diff = "2.0" assert-json-diff = "2.0"
@ -146,8 +145,12 @@ aws-smithy-runtime = { version = "1.8", default-features = false, features = ["t
aws-sdk-config = { version = "1.62", default-features = false } aws-sdk-config = { version = "1.62", default-features = false }
aws-sdk-s3 = { version = "1.79", default-features = false, features = ["rt-tokio"] } aws-sdk-s3 = { version = "1.79", default-features = false, features = ["rt-tokio"] }
[profile.dev]
#lto = "thin" # disabled for now, adds 2-4 min to each CI build
lto = "off"
[profile.release] [profile.release]
lto = "thin" lto = true
codegen-units = 16 codegen-units = 1
opt-level = 3 opt-level = "s"
strip = "debuginfo" strip = true

View file

@ -12,7 +12,7 @@ In this section, we cover the following web applications:
| [Mastodon](#mastodon) | ✅ | Natively supported | | [Mastodon](#mastodon) | ✅ | Natively supported |
| [Matrix](#matrix) | ✅ | Tested with `synapse-s3-storage-provider` | | [Matrix](#matrix) | ✅ | Tested with `synapse-s3-storage-provider` |
| [ejabberd](#ejabberd) | ✅ | `mod_s3_upload` | | [ejabberd](#ejabberd) | ✅ | `mod_s3_upload` |
| [Pixelfed](#pixelfed) | ✅ | Natively supported | | [Pixelfed](#pixelfed) | ❓ | Not yet tested |
| [Pleroma](#pleroma) | ❓ | Not yet tested | | [Pleroma](#pleroma) | ❓ | Not yet tested |
| [Lemmy](#lemmy) | ✅ | Supported with pict-rs | | [Lemmy](#lemmy) | ✅ | Supported with pict-rs |
| [Funkwhale](#funkwhale) | ❓ | Not yet tested | | [Funkwhale](#funkwhale) | ❓ | Not yet tested |
@ -191,10 +191,10 @@ garage key create peertube-key
Keep the Key ID and the Secret key in a pad, they will be needed later. Keep the Key ID and the Secret key in a pad, they will be needed later.
We need two buckets, one for normal videos (named peertube-videos) and one for webtorrent videos (named peertube-playlists). We need two buckets, one for normal videos (named peertube-video) and one for webtorrent videos (named peertube-playlist).
```bash ```bash
garage bucket create peertube-videos garage bucket create peertube-videos
garage bucket create peertube-playlists garage bucket create peertube-playlist
``` ```
Now we allow our key to read and write on these buckets: Now we allow our key to read and write on these buckets:
@ -253,7 +253,7 @@ object_storage:
proxify_private_files: false proxify_private_files: false
streaming_playlists: streaming_playlists:
bucket_name: 'peertube-playlists' bucket_name: 'peertube-playlist'
# Keep it empty for our example # Keep it empty for our example
prefix: '' prefix: ''

View file

@ -161,49 +161,3 @@ kopia repository validate-provider
You can then run all the standard kopia commands: `kopia snapshot create`, `kopia mount`... You can then run all the standard kopia commands: `kopia snapshot create`, `kopia mount`...
Everything should work out-of-the-box. Everything should work out-of-the-box.
## Plakar
Create your key and bucket on Garage server:
```bash
garage key create my-plakar-key
garage bucket create plakar-backups
garage bucket allow plakar-backups --read --write --key my-plakar-key
```
On Plakar server, add your Garage as a storage location:
```bash
# Set region to the value you've specified in garage.toml
plakar store add garageS3 s3://my-garage.tld/plakar-backups \
region=garage \
access_key=<Key ID from "garage key info my-plakar-key"> \
secret_access_key=<Secret key from "garage key info my-plakar-key">
```
Then create the repository.
```bash
plakar at @garageS3 create -plaintext # Unencrypted
# or
plakar at @garageS3 create #encrypted
```
If you encrypt your backups (Plakar default), you will need to define a strong passphrase. Do not forget to save your password safely. It will be needed to decrypt your backups.
After the repository has been created, check that everything works as expected (that might give an empty result as no file has been added yet, but no error message):
```bash
plakar at @garageS3 check
```
Now that everything is configured, you can use Garage as your backup storage. For instance, sync it with a local backup storage:
```bash
$ plakar at ~/backups sync to @garageS3
```
Or list the S3 storage content:
```bash
$ plakar at @garageS3 ls
```
More information in Plakar documentation: https://www.plakar.io/docs/main/quickstart/

View file

@ -8,18 +8,18 @@ have published Ansible roles. We list them and compare them below.
## Comparison of Ansible roles ## Comparison of Ansible roles
| Feature | [ansible-role-garage](#zorun-ansible-role-garage) | [garage-docker-ansible-deploy](#moan0s-garage-docker-ansible-deploy) | [eddster ansible-role-garage](#eddster-ansible-role-garage) | | Feature | [ansible-role-garage](#zorun-ansible-role-garage) | [garage-docker-ansible-deploy](#moan0s-garage-docker-ansible-deploy) |
|------------------------------------|---------------------------------------------|---------------------------------------------------------------|---------------------------------| |------------------------------------|---------------------------------------------|---------------------------------------------------------------|
| **Runtime** | Systemd | Docker | Systemd | | **Runtime** | Systemd | Docker |
| **Target OS** | Any Linux | Any Linux | Any Linux | | **Target OS** | Any Linux | Any Linux |
| **Architecture** | amd64, arm64, i686 | amd64, arm64 | arm64, arm, 386, amd64 | | **Architecture** | amd64, arm64, i686 | amd64, arm64 |
| **Additional software** | None | Traefik | Nginx and Keepalived (optional) | | **Additional software** | None | Traefik |
| **Automatic node connection** | ❌ | ✅ | ✅ | | **Automatic node connection** | ❌ | ✅ |
| **Layout management** | ❌ | ✅ | ✅ | | **Layout management** | ❌ | ✅ |
| **Manage buckets & keys** | ❌ | ✅ (basic) | ✅ | | **Manage buckets & keys** | ❌ | ✅ (basic) |
| **Allow custom Garage config** | ✅ | ❌ | ❌ | | **Allow custom Garage config** | ✅ | ❌ |
| **Facilitate Garage upgrades** | ✅ | ❌ | ✅ | | **Facilitate Garage upgrades** | ✅ | ❌ |
| **Multiple instances on one host** | ✅ | ✅ | ❌ | | **Multiple instances on one host** | ✅ | ✅ |
## zorun/ansible-role-garage ## zorun/ansible-role-garage
@ -49,15 +49,3 @@ structured DNS names, etc).
As a result, this role makes it easier to start with Garage on Ansible, As a result, this role makes it easier to start with Garage on Ansible,
but is less flexible. but is less flexible.
## eddster2309/ansible-role-garage
[Source code](https://github.com/eddster2309/ansible-role-garage), [Ansible galaxy](https://galaxy.ansible.com/ui/standalone/roles/eddster2309/garage/)
This role is an opinionated but customisable role using the official Garage
static binaries and only requires Systemd. As such it should work on any
Linux based host. It includes all the necessary configuration to
automatically set up a clustered Garage deployment. Most Garage
configuration options are exposed through Ansible variables so while you
can't provide a custom config you can get very close. It can optionally
install an HA nginx deployment with Keepalived.

View file

@ -15,10 +15,9 @@ Alpine Linux repositories (available since v3.17):
apk add garage apk add garage
``` ```
The default configuration file is installed to `/etc/garage/garage.toml`. You can run The default configuration file is installed to `/etc/garage.toml`. You can run
Garage using: `rc-service garage start`. Garage using: `rc-service garage start`. If you don't specify `rpc_secret`, it
will be automatically replaced with a random string on the first start.
If you don't specify `rpc_secret`, it will be automatically replaced with a random string on the first start.
Please note that this package is built without Consul discovery, Kubernetes Please note that this package is built without Consul discovery, Kubernetes
discovery, OpenTelemetry exporter, and K2V features (K2V will be enabled once discovery, OpenTelemetry exporter, and K2V features (K2V will be enabled once
@ -27,7 +26,7 @@ it's stable).
## Arch Linux ## Arch Linux
Garage is available in the official repositories under [extra](https://archlinux.org/packages/extra/x86_64/garage). Garage is available in the [AUR](https://aur.archlinux.org/packages/garage).
## FreeBSD ## FreeBSD

View file

@ -11,7 +11,7 @@ Firstly clone the repository:
```bash ```bash
git clone https://git.deuxfleurs.fr/Deuxfleurs/garage git clone https://git.deuxfleurs.fr/Deuxfleurs/garage
cd garage/script/helm cd garage/scripts/helm
``` ```
Deploy with default options: Deploy with default options:
@ -26,13 +26,6 @@ Or deploy with custom values:
helm install --create-namespace --namespace garage garage ./garage -f values.override.yaml helm install --create-namespace --namespace garage garage ./garage -f values.override.yaml
``` ```
If you want to manage the CustomResourceDefinition used by garage for its `kubernetes_discovery` outside of the helm chart, add `garage.kubernetesSkipCrd: true` to your custom values and use the kustomization before deploying the helm chart:
```bash
kubectl apply -k ../k8s/crd
helm install --create-namespace --namespace garage garage ./garage -f values.override.yaml
```
After deploying, cluster layout must be configured manually as described in [Creating a cluster layout](@/documentation/quick-start/_index.md#creating-a-cluster-layout). Use the following command to access garage CLI: After deploying, cluster layout must be configured manually as described in [Creating a cluster layout](@/documentation/quick-start/_index.md#creating-a-cluster-layout). Use the following command to access garage CLI:
```bash ```bash

View file

@ -96,14 +96,14 @@ to store 2 TB of data in total.
## Get a Docker image ## Get a Docker image
Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). Our docker image is currently named `dxflrs/garage` and is stored on the [Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
We encourage you to use a fixed tag (eg. `v1.3.0`) and not the `latest` tag. We encourage you to use a fixed tag (eg. `v1.1.0`) and not the `latest` tag.
For this example, we will use the latest published version at the time of the writing which is `v1.3.0` but it's up to you For this example, we will use the latest published version at the time of the writing which is `v1.1.0` but it's up to you
to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated). to check [the most recent versions on the Docker Hub](https://hub.docker.com/r/dxflrs/garage/tags?page=1&ordering=last_updated).
For example: For example:
``` ```
sudo docker pull dxflrs/garage:v1.3.0 sudo docker pull dxflrs/garage:v1.1.0
``` ```
## Deploying and configuring Garage ## Deploying and configuring Garage
@ -171,7 +171,7 @@ docker run \
-v /etc/garage.toml:/etc/garage.toml \ -v /etc/garage.toml:/etc/garage.toml \
-v /var/lib/garage/meta:/var/lib/garage/meta \ -v /var/lib/garage/meta:/var/lib/garage/meta \
-v /var/lib/garage/data:/var/lib/garage/data \ -v /var/lib/garage/data:/var/lib/garage/data \
dxflrs/garage:v1.3.0 dxflrs/garage:v1.1.0
``` ```
With this command line, Garage should be started automatically at each boot. With this command line, Garage should be started automatically at each boot.
@ -185,7 +185,7 @@ If you want to use `docker-compose`, you may use the following `docker-compose.y
version: "3" version: "3"
services: services:
garage: garage:
image: dxflrs/garage:v1.3.0 image: dxflrs/garage:v1.1.0
network_mode: "host" network_mode: "host"
restart: unless-stopped restart: unless-stopped
volumes: volumes:

View file

@ -28,7 +28,6 @@ StateDirectory=garage
DynamicUser=true DynamicUser=true
ProtectHome=true ProtectHome=true
NoNewPrivileges=true NoNewPrivileges=true
LimitNOFILE=42000
[Install] [Install]
WantedBy=multi-user.target WantedBy=multi-user.target

View file

@ -132,7 +132,7 @@ docker run \
-v /path/to/garage.toml:/etc/garage.toml \ -v /path/to/garage.toml:/etc/garage.toml \
-v /path/to/garage/meta:/var/lib/garage/meta \ -v /path/to/garage/meta:/var/lib/garage/meta \
-v /path/to/garage/data:/var/lib/garage/data \ -v /path/to/garage/data:/var/lib/garage/data \
dxflrs/garage:v1.3.0 dxflrs/garage:v1.1.0
``` ```
Under Linux, you can substitute `--network host` for `-p 3900:3900 -p 3901:3901 -p 3902:3902 -p 3903:3903` Under Linux, you can substitute `--network host` for `-p 3900:3900 -p 3901:3901 -p 3902:3902 -p 3903:3903`
@ -182,12 +182,11 @@ ID Hostname Address Tag Zone Capacit
## Creating a cluster layout ## Creating a cluster layout
Creating a cluster layout for a Garage deployment means informing Garage Creating a cluster layout for a Garage deployment means informing Garage
of the disk space available on each node of the cluster, `-c`, of the disk space available on each node of the cluster
as well as the name of the zone (e.g. datacenter), `-z`, each machine is located in. as well as the zone (e.g. datacenter) each machine is located in.
For our test deployment, we have only one node with a zone named `dc1` and a For our test deployment, we are using only one node. The way in which we configure
capacity of `1G`, though the capacity is ignored for a single node deployment it does not matter, you can simply write:
and can be changed later when adding new nodes.
```bash ```bash
garage layout assign -z dc1 -c 1G <node_id> garage layout assign -z dc1 -c 1G <node_id>

View file

@ -24,8 +24,7 @@ db_engine = "lmdb"
block_size = "1M" block_size = "1M"
block_ram_buffer_max = "256MiB" block_ram_buffer_max = "256MiB"
block_max_concurrent_reads = 16
block_max_concurrent_writes_per_request =10
lmdb_map_size = "1T" lmdb_map_size = "1T"
compression_level = 1 compression_level = 1
@ -47,7 +46,6 @@ bootstrap_peers = [
"212fd62eeaca72c122b45a7f4fa0f55e012aa5e24ac384a72a3016413fa724ff@[fc00:F::1]:3901", "212fd62eeaca72c122b45a7f4fa0f55e012aa5e24ac384a72a3016413fa724ff@[fc00:F::1]:3901",
] ]
allow_punycode = false
[consul_discovery] [consul_discovery]
api = "catalog" api = "catalog"
@ -94,32 +92,29 @@ The following gives details about each available configuration option.
[Environment variables](#env_variables). [Environment variables](#env_variables).
Top-level configuration options, in alphabetical order: Top-level configuration options:
[`allow_punycode`](#allow_punycode),
[`allow_world_readable_secrets`](#allow_world_readable_secrets), [`allow_world_readable_secrets`](#allow_world_readable_secrets),
[`block_max_concurrent_reads`](#block_max_concurrent_reads),
[`block_ram_buffer_max`](#block_ram_buffer_max), [`block_ram_buffer_max`](#block_ram_buffer_max),
[`block_max_concurrent_writes_per_request`](#block_max_concurrent_writes_per_request),
[`block_size`](#block_size), [`block_size`](#block_size),
[`bootstrap_peers`](#bootstrap_peers), [`bootstrap_peers`](#bootstrap_peers),
[`compression_level`](#compression_level), [`compression_level`](#compression_level),
[`consistency_mode`](#consistency_mode),
[`data_dir`](#data_dir), [`data_dir`](#data_dir),
[`data_fsync`](#data_fsync), [`data_fsync`](#data_fsync),
[`db_engine`](#db_engine), [`db_engine`](#db_engine),
[`disable_scrub`](#disable_scrub), [`disable_scrub`](#disable_scrub),
[`use_local_tz`](#use_local_tz),
[`lmdb_map_size`](#lmdb_map_size), [`lmdb_map_size`](#lmdb_map_size),
[`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval), [`metadata_auto_snapshot_interval`](#metadata_auto_snapshot_interval),
[`metadata_dir`](#metadata_dir), [`metadata_dir`](#metadata_dir),
[`metadata_fsync`](#metadata_fsync), [`metadata_fsync`](#metadata_fsync),
[`metadata_snapshots_dir`](#metadata_snapshots_dir), [`metadata_snapshots_dir`](#metadata_snapshots_dir),
[`replication_factor`](#replication_factor), [`replication_factor`](#replication_factor),
[`consistency_mode`](#consistency_mode),
[`rpc_bind_addr`](#rpc_bind_addr), [`rpc_bind_addr`](#rpc_bind_addr),
[`rpc_bind_outgoing`](#rpc_bind_outgoing), [`rpc_bind_outgoing`](#rpc_bind_outgoing),
[`rpc_public_addr`](#rpc_public_addr), [`rpc_public_addr`](#rpc_public_addr),
[`rpc_public_addr_subnet`](#rpc_public_addr_subnet), [`rpc_public_addr_subnet`](#rpc_public_addr_subnet),
[`rpc_secret`/`rpc_secret_file`](#rpc_secret), [`rpc_secret`/`rpc_secret_file`](#rpc_secret).
[`use_local_tz`](#use_local_tz).
The `[consul_discovery]` section: The `[consul_discovery]` section:
[`api`](#consul_api), [`api`](#consul_api),
@ -156,17 +151,13 @@ The `[admin]` section:
### Environment variables {#env_variables} ### Environment variables {#env_variables}
The following configuration parameters must be specified as environment variables, The following configuration parameter must be specified as an environment
they do not exist in the configuration file: variable, it does not exist in the configuration file:
- `GARAGE_LOG_TO_SYSLOG` (since `v0.9.4`): set this to `1` or `true` to make the - `GARAGE_LOG_TO_SYSLOG` (since `v0.9.4`): set this to `1` or `true` to make the
Garage daemon send its logs to `syslog` (using the libc `syslog` function) Garage daemon send its logs to `syslog` (using the libc `syslog` function)
instead of printing to stderr. instead of printing to stderr.
- `GARAGE_LOG_TO_JOURNALD` (since `v1.2.0`): set this to `1` or `true` to make the
Garage daemon send its logs to `journald` (using the native protocol of `systemd-journald`)
instead of printing to stderr.
The following environment variables can be used to override the corresponding The following environment variables can be used to override the corresponding
values in the configuration file: values in the configuration file:
@ -178,7 +169,7 @@ values in the configuration file:
### Top-level configuration options ### Top-level configuration options
#### `replication_factor` (since `v1.0.0`) {#replication_factor} #### `replication_factor` {#replication_factor}
The replication factor can be any positive integer smaller than or equal to the node count in your cluster. The replication factor can be any positive integer smaller than or equal to the node count in your cluster.
The chosen replication factor has a big impact on the cluster's failure tolerance and performance characteristics. The chosen replication factor has a big impact on the cluster's failure tolerance and performance characteristics.
@ -226,7 +217,7 @@ is in progress. In theory, no data should be lost as rebalancing is a
routine operation for Garage, although we cannot guarantee you that everything routine operation for Garage, although we cannot guarantee you that everything
will go right in such an extreme scenario. will go right in such an extreme scenario.
#### `consistency_mode` (since `v1.0.0`) {#consistency_mode} #### `consistency_mode` {#consistency_mode}
The consistency mode setting determines the read and write behaviour of your cluster. The consistency mode setting determines the read and write behaviour of your cluster.
@ -336,7 +327,6 @@ Since `v0.8.0`, Garage can use alternative storage backends as follows:
| --------- | ----------------- | ------------- | | --------- | ----------------- | ------------- |
| [LMDB](https://www.symas.com/lmdb) (since `v0.8.0`, default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` | | [LMDB](https://www.symas.com/lmdb) (since `v0.8.0`, default since `v0.9.0`) | `"lmdb"` | `<metadata_dir>/db.lmdb/` |
| [Sqlite](https://sqlite.org) (since `v0.8.0`) | `"sqlite"` | `<metadata_dir>/db.sqlite` | | [Sqlite](https://sqlite.org) (since `v0.8.0`) | `"sqlite"` | `<metadata_dir>/db.sqlite` |
| [Fjall](https://github.com/fjall-rs/fjall) (**experimental support** since `v1.3.0`) | `"fjall"` | `<metadata_dir>/db.fjall/` |
| [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `<metadata_dir>/db/` | | [Sled](https://sled.rs) (old default, removed since `v1.0`) | `"sled"` | `<metadata_dir>/db/` |
Sled was supported until Garage v0.9.x, and was removed in Garage v1.0. Sled was supported until Garage v0.9.x, and was removed in Garage v1.0.
@ -373,14 +363,6 @@ LMDB works very well, but is known to have the following limitations:
so it is not the best choice for high-performance storage clusters, so it is not the best choice for high-performance storage clusters,
but it should work fine in many cases. but it should work fine in many cases.
- Fjall: a storage engine based on LSM trees, which theoretically allow for
higher write throughput than other storage engines that are based on B-trees.
Using Fjall could potentially improve Garage's performance significantly in
write-heavy workloads. **Support for Fjall is experimental at this point**,
we have added it to Garage for evaluation purposes only. **Do not use it for
production-critical workloads.**
It is possible to convert Garage's metadata directory from one format to another It is possible to convert Garage's metadata directory from one format to another
using the `garage convert-db` command, which should be used as follows: using the `garage convert-db` command, which should be used as follows:
@ -418,7 +400,6 @@ Here is how this option impacts the different database engines:
|----------|------------------------------------|-------------------------------| |----------|------------------------------------|-------------------------------|
| Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` | | Sqlite | `PRAGMA synchronous = OFF` | `PRAGMA synchronous = NORMAL` |
| LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` | | LMDB | `MDB_NOMETASYNC` + `MDB_NOSYNC` | `MDB_NOMETASYNC` |
| Fjall | default options | not supported |
Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`). Note that the Sqlite database is always run in `WAL` mode (`PRAGMA journal_mode = WAL`).
@ -525,37 +506,6 @@ node.
The default value is 256MiB. The default value is 256MiB.
#### `block_max_concurrent_reads` (since `v1.3.0` / `v2.1.0`) {#block_max_concurrent_reads}
The maximum number of blocks (individual files in the data directory) open
simultaneously for reading.
Reducing this number does not limit the number of data blocks that can be
transferred through the network simultaneously. This mechanism was just added
as a backpressure mechanism for HDD read speed: it helps avoid a situation
where too many requests are coming in and Garage is reading too many block
files simultaneously, thus not making timely progress on any of the reads.
When a request to read a data block comes in through the network, the request
waits for one of the `block_max_concurrent_reads` slots to become available
(internally implemented using a Semaphore object). Once it has acquired a read
slot, it reads the entire block file to RAM and frees the slot as soon as the
block file is finished reading. Only after the slot is released will the
block's data start being transferred over the network. If the request fails to
acquire a reading slot within 15 seconds, it fails with a timeout error.
Timeout events can be monitored through the `block_read_semaphore_timeouts`
metric in Prometheus: a non-zero number of such events indicates an I/O
bottleneck on HDD read speed.
#### `block_max_concurrent_writes_per_request` (since `v2.1.0`) {#block_max_concurrent_writes_per_request}
This parameter is designed to adapt to the concurrent write performance of
different storage media. Maximum number of parallel block writes per put request.
Higher values improve throughput but increase memory usage.
Default: 3, Recommended: 10-30 for NVMe, 3-10 for HDD
#### `lmdb_map_size` {#lmdb_map_size} #### `lmdb_map_size` {#lmdb_map_size}
This parameter can be used to set the map size used by LMDB, This parameter can be used to set the map size used by LMDB,
@ -654,7 +604,7 @@ be obtained by running `garage node id` and then included directly in the
key will be returned by `garage node id` and you will have to add the IP key will be returned by `garage node id` and you will have to add the IP
yourself. yourself.
#### `allow_world_readable_secrets` or `GARAGE_ALLOW_WORLD_READABLE_SECRETS` (env) {#allow_world_readable_secrets} ### `allow_world_readable_secrets` or `GARAGE_ALLOW_WORLD_READABLE_SECRETS` (env) {#allow_world_readable_secrets}
Garage checks the permissions of your secret files to make sure they're not Garage checks the permissions of your secret files to make sure they're not
world-readable. In some cases, the check might fail and consider your files as world-readable. In some cases, the check might fail and consider your files as
@ -666,13 +616,6 @@ permission verification.
Alternatively, you can set the `GARAGE_ALLOW_WORLD_READABLE_SECRETS` Alternatively, you can set the `GARAGE_ALLOW_WORLD_READABLE_SECRETS`
environment variable to `true` to bypass the permissions check. environment variable to `true` to bypass the permissions check.
#### `allow_punycode` {#allow_punycode}
Allow creating buckets with names containing punycode. When used for buckets served
as websites, this allows using almost any unicode character in the domain name.
Defaults to `false`.
### The `[consul_discovery]` section ### The `[consul_discovery]` section
Garage supports discovering other nodes of the cluster using Consul. For this Garage supports discovering other nodes of the cluster using Consul. For this

View file

@ -23,17 +23,17 @@ Feel free to open a PR to suggest fixes this table. Minio is missing because the
- 2022-05-25 - Many Ceph S3 endpoints are not documented but implemented. Following a notification from the Ceph community, we added them. - 2022-05-25 - Many Ceph S3 endpoints are not documented but implemented. Following a notification from the Ceph community, we added them.
## High-level features ## High-level features
| Feature | Garage | [Openstack Swift](https://docs.openstack.org/swift/latest/s3_compat.html) | [Ceph Object Gateway](https://docs.ceph.com/en/latest/radosgw/s3/) | [Riak CS](https://docs.riak.com/riak/cs/2.1.1/references/apis/storage/s3/index.html) | [OpenIO](https://docs.openio.io/latest/source/arch-design/s3_compliancy.html) | | Feature | Garage | [Openstack Swift](https://docs.openstack.org/swift/latest/s3_compat.html) | [Ceph Object Gateway](https://docs.ceph.com/en/latest/radosgw/s3/) | [Riak CS](https://docs.riak.com/riak/cs/2.1.1/references/apis/storage/s3/index.html) | [OpenIO](https://docs.openio.io/latest/source/arch-design/s3_compliancy.html) |
|------------------------------|----------------------------------|-----------------|---------------|---------|-----| |------------------------------|----------------------------------|-----------------|---------------|---------|-----|
| [signature v2](https://docs.aws.amazon.com/AmazonS3/latest/API/Appendix-Sigv2.html) (deprecated) | ❌ Missing | ✅ | ✅ | ✅ | ✅ | | [signature v2](https://docs.aws.amazon.com/general/latest/gr/signature-version-2.html) (deprecated) | ❌ Missing | ✅ | ✅ | ✅ | ✅ |
| [signature v4](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html) | ✅ Implemented | ✅ | ✅ | ❌ | ✅ | | [signature v4](https://docs.aws.amazon.com/AmazonS3/latest/API/sig-v4-authenticating-requests.html) | ✅ Implemented | ✅ | ✅ | ❌ | ✅ |
| [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ | | [URL path-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#path-style-access) (eg. `host.tld/bucket/key`) | ✅ Implemented | ✅ | ✅ | ❓| ✅ |
| [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ | | [URL vhost-style](https://docs.aws.amazon.com/AmazonS3/latest/userguide/VirtualHosting.html#virtual-hosted-style-access) URL (eg. `bucket.host.tld/key`) | ✅ Implemented | ❌| ✅| ✅ | ✅ |
| [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) | | [Presigned URLs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ShareObjectPreSignedURL.html) | ✅ Implemented | ❌| ✅ | ✅ | ✅(❓) |
| [SSE-C encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html) | ✅ Implemented | ❓ | ✅ | ❌ | ✅ | | [SSE-C encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerSideEncryptionCustomerKeys.html) | ✅ Implemented | ❓ | ✅ | ❌ | ✅ |
| [Bucket versioning](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Versioning.html) | ❌ Missing | ✅ | ✅ | ❌ | ✅ |
*Note:* OpenIO does not say whether it supports presigned URLs. Because it is part *Note:* OpenIO does not say whether it supports presigned URLs. Because it is part
of signature v4 and they claim they support it without additional precisions, of signature v4 and they claim they support it without additional precisions,

View file

@ -70,7 +70,7 @@ Example response body:
```json ```json
{ {
"node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df", "node": "b10c110e4e854e5aa3f4637681befac755154b20059ec163254ddbfae86b09df",
"garageVersion": "v1.3.0", "garageVersion": "v1.1.0",
"garageFeatures": [ "garageFeatures": [
"k2v", "k2v",
"lmdb", "lmdb",

16
flake.lock generated
View file

@ -50,17 +50,17 @@
}, },
"nixpkgs": { "nixpkgs": {
"locked": { "locked": {
"lastModified": 1763977559, "lastModified": 1736692550,
"narHash": "sha256-g4MKqsIRy5yJwEsI+fYODqLUnAqIY4kZai0nldAP6EM=", "narHash": "sha256-7tk8xH+g0sJkKLTJFOxphJxxOjMDFMWv24nXslaU2ro=",
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "cfe2c7d5b5d3032862254e68c37a6576b633d632", "rev": "7c4869c47090dd7f9f1bdfb49a22aea026996815",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "NixOS", "owner": "NixOS",
"repo": "nixpkgs", "repo": "nixpkgs",
"rev": "cfe2c7d5b5d3032862254e68c37a6576b633d632", "rev": "7c4869c47090dd7f9f1bdfb49a22aea026996815",
"type": "github" "type": "github"
} }
}, },
@ -80,17 +80,17 @@
] ]
}, },
"locked": { "locked": {
"lastModified": 1763952169, "lastModified": 1738549608,
"narHash": "sha256-+PeDBD8P+NKauH+w7eO/QWCIp8Cx4mCfWnh9sJmy9CM=", "narHash": "sha256-GdyT9QEUSx5k/n8kILuNy83vxxdyUfJ8jL5mMpQZWfw=",
"owner": "oxalica", "owner": "oxalica",
"repo": "rust-overlay", "repo": "rust-overlay",
"rev": "ab726555a9a72e6dc80649809147823a813fa95b", "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
"type": "github" "type": "github"
}, },
"original": { "original": {
"owner": "oxalica", "owner": "oxalica",
"repo": "rust-overlay", "repo": "rust-overlay",
"rev": "ab726555a9a72e6dc80649809147823a813fa95b", "rev": "35c6f8c4352f995ecd53896200769f80a3e8f22d",
"type": "github" "type": "github"
} }
}, },

View file

@ -2,13 +2,13 @@
description = description =
"Garage, an S3-compatible distributed object store for self-hosted deployments"; "Garage, an S3-compatible distributed object store for self-hosted deployments";
# Nixpkgs 25.05 as of 2025-11-24 # Nixpkgs 24.11 as of 2025-01-12
inputs.nixpkgs.url = inputs.nixpkgs.url =
"github:NixOS/nixpkgs/cfe2c7d5b5d3032862254e68c37a6576b633d632"; "github:NixOS/nixpkgs/7c4869c47090dd7f9f1bdfb49a22aea026996815";
# Rust overlay as of 2025-11-24 # Rust overlay as of 2025-02-03
inputs.rust-overlay.url = inputs.rust-overlay.url =
"github:oxalica/rust-overlay/ab726555a9a72e6dc80649809147823a813fa95b"; "github:oxalica/rust-overlay/35c6f8c4352f995ecd53896200769f80a3e8f22d";
inputs.rust-overlay.inputs.nixpkgs.follows = "nixpkgs"; inputs.rust-overlay.inputs.nixpkgs.follows = "nixpkgs";
inputs.crane.url = "github:ipetkov/crane"; inputs.crane.url = "github:ipetkov/crane";
@ -30,10 +30,6 @@
inherit system nixpkgs crane rust-overlay extraTestEnv; inherit system nixpkgs crane rust-overlay extraTestEnv;
release = false; release = false;
}).garage-test; }).garage-test;
lints = (compile {
inherit system nixpkgs crane rust-overlay;
release = false;
});
in in
{ {
packages = { packages = {
@ -57,13 +53,6 @@
tests-sqlite = testWith { tests-sqlite = testWith {
GARAGE_TEST_INTEGRATION_DB_ENGINE = "sqlite"; GARAGE_TEST_INTEGRATION_DB_ENGINE = "sqlite";
}; };
tests-fjall = testWith {
GARAGE_TEST_INTEGRATION_DB_ENGINE = "fjall";
};
# lints (fmt, clippy)
fmt = lints.garage-cargo-fmt;
clippy = lints.garage-cargo-clippy;
}; };
# ---- development shell, for making native builds only ---- # ---- development shell, for making native builds only ----

View file

@ -48,7 +48,7 @@ let
inherit (pkgs) lib stdenv; inherit (pkgs) lib stdenv;
toolchainFn = (p: p.rust-bin.stable."1.91.0".default.override { toolchainFn = (p: p.rust-bin.stable."1.82.0".default.override {
targets = lib.optionals (target != null) [ rustTarget ]; targets = lib.optionals (target != null) [ rustTarget ];
extensions = [ extensions = [
"rust-src" "rust-src"
@ -68,13 +68,12 @@ let
rootFeatures = if features != null then rootFeatures = if features != null then
features features
else else
([ "bundled-libs" "lmdb" "sqlite" "fjall" "k2v" ] ++ (lib.optionals release [ ([ "bundled-libs" "lmdb" "sqlite" "k2v" ] ++ (lib.optionals release [
"consul-discovery" "consul-discovery"
"kubernetes-discovery" "kubernetes-discovery"
"metrics" "metrics"
"telemetry-otlp" "telemetry-otlp"
"syslog" "syslog"
"journald"
])); ]));
featuresStr = lib.concatStringsSep "," rootFeatures; featuresStr = lib.concatStringsSep "," rootFeatures;
@ -190,15 +189,4 @@ in rec {
pkgs.cacert pkgs.cacert
]; ];
} // extraTestEnv); } // extraTestEnv);
# ---- source code linting ----
garage-cargo-fmt = craneLib.cargoFmt (commonArgs // {
cargoExtraArgs = "";
});
garage-cargo-clippy = craneLib.cargoClippy (commonArgs // {
cargoArtifacts = garage-deps;
cargoClippyExtraArgs = "--all-targets -- -D warnings";
});
} }

View file

@ -1,7 +1,6 @@
export AWS_ACCESS_KEY_ID=`cat /tmp/garage.s3 |cut -d' ' -f1` export AWS_ACCESS_KEY_ID=`cat /tmp/garage.s3 |cut -d' ' -f1`
export AWS_SECRET_ACCESS_KEY=`cat /tmp/garage.s3 |cut -d' ' -f2` export AWS_SECRET_ACCESS_KEY=`cat /tmp/garage.s3 |cut -d' ' -f2`
export AWS_DEFAULT_REGION='garage' export AWS_DEFAULT_REGION='garage'
export AWS_REQUEST_CHECKSUM_CALCULATION='when_required'
# FUTUREWORK: set AWS_ENDPOINT_URL instead, once nixpkgs bumps awscli to >=2.13.0. # FUTUREWORK: set AWS_ENDPOINT_URL instead, once nixpkgs bumps awscli to >=2.13.0.
function aws { command aws --endpoint-url http://127.0.0.1:3911 $@ ; } function aws { command aws --endpoint-url http://127.0.0.1:3911 $@ ; }

View file

@ -2,8 +2,8 @@ apiVersion: v2
name: garage name: garage
description: S3-compatible object store for small self-hosted geo-distributed deployments description: S3-compatible object store for small self-hosted geo-distributed deployments
type: application type: application
version: 0.7.3 version: 0.7.0
appVersion: "v1.3.1" appVersion: "v1.1.0"
home: https://garagehq.deuxfleurs.fr/ home: https://garagehq.deuxfleurs.fr/
icon: https://garagehq.deuxfleurs.fr/images/garage-logo.svg icon: https://garagehq.deuxfleurs.fr/images/garage-logo.svg

View file

@ -1,6 +1,6 @@
# garage # garage
![Version: 0.7.3](https://img.shields.io/badge/Version-0.7.3-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1.3.1](https://img.shields.io/badge/AppVersion-v1.3.1-informational?style=flat-square) ![Version: 0.7.0](https://img.shields.io/badge/Version-0.7.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: v1.1.0](https://img.shields.io/badge/AppVersion-v1.1.0-informational?style=flat-square)
S3-compatible object store for small self-hosted geo-distributed deployments S3-compatible object store for small self-hosted geo-distributed deployments

View file

@ -4,10 +4,6 @@ metadata:
name: {{ include "garage.fullname" . }} name: {{ include "garage.fullname" . }}
labels: labels:
{{- include "garage.labels" . | nindent 4 }} {{- include "garage.labels" . | nindent 4 }}
{{- with .Values.service.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec: spec:
type: {{ .Values.service.type }} type: {{ .Values.service.type }}
ports: ports:

View file

@ -124,8 +124,6 @@ service:
# - NodePort (+ Ingress) # - NodePort (+ Ingress)
# - LoadBalancer # - LoadBalancer
type: ClusterIP type: ClusterIP
# -- Annotations to add to the service
annotations: {}
s3: s3:
api: api:
port: 3900 port: 3900

View file

@ -1,43 +0,0 @@
apiVersion: apiextensions.k8s.io/v1
kind: CustomResourceDefinition
metadata:
name: garagenodes.deuxfleurs.fr
spec:
conversion:
strategy: None
group: deuxfleurs.fr
names:
kind: GarageNode
listKind: GarageNodeList
plural: garagenodes
singular: garagenode
scope: Namespaced
versions:
- name: v1
schema:
openAPIV3Schema:
description: Auto-generated derived type for Node via `CustomResource`
properties:
spec:
properties:
address:
format: ip
type: string
hostname:
type: string
port:
format: uint16
minimum: 0
type: integer
required:
- address
- hostname
- port
type: object
required:
- spec
title: GarageNode
type: object
served: true
storage: true
subresources: {}

View file

@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- garagenodes.deuxfleurs.fr.yaml

View file

@ -34,8 +34,6 @@ in
jq jq
]; ];
shellHook = '' shellHook = ''
export AWS_REQUEST_CHECKSUM_CALCULATION='when_required'
function to_s3 { function to_s3 {
aws \ aws \
--endpoint-url https://garage.deuxfleurs.fr \ --endpoint-url https://garage.deuxfleurs.fr \

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_api_admin" name = "garage_api_admin"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -22,7 +22,7 @@ garage_api_common.workspace = true
argon2.workspace = true argon2.workspace = true
async-trait.workspace = true async-trait.workspace = true
thiserror.workspace = true err-derive.workspace = true
hex.workspace = true hex.workspace = true
tracing.workspace = true tracing.workspace = true

View file

@ -277,7 +277,7 @@ pub async fn handle_create_bucket(
let helper = garage.locked_helper().await; let helper = garage.locked_helper().await;
if let Some(ga) = &req.global_alias { if let Some(ga) = &req.global_alias {
if !is_valid_bucket_name(ga, garage.config.allow_punycode) { if !is_valid_bucket_name(ga) {
return Err(Error::bad_request(format!( return Err(Error::bad_request(format!(
"{}: {}", "{}: {}",
ga, INVALID_BUCKET_NAME_MESSAGE ga, INVALID_BUCKET_NAME_MESSAGE
@ -292,7 +292,7 @@ pub async fn handle_create_bucket(
} }
if let Some(la) = &req.local_alias { if let Some(la) = &req.local_alias {
if !is_valid_bucket_name(&la.alias, garage.config.allow_punycode) { if !is_valid_bucket_name(&la.alias) {
return Err(Error::bad_request(format!( return Err(Error::bad_request(format!(
"{}: {}", "{}: {}",
la.alias, INVALID_BUCKET_NAME_MESSAGE la.alias, INVALID_BUCKET_NAME_MESSAGE
@ -382,7 +382,7 @@ pub async fn handle_delete_bucket(
for ((key_id, alias), _, active) in state.local_aliases.items().iter() { for ((key_id, alias), _, active) in state.local_aliases.items().iter() {
if *active { if *active {
helper helper
.purge_local_bucket_alias(bucket.id, key_id, alias) .unset_local_bucket_alias(bucket.id, key_id, alias)
.await?; .await?;
} }
} }

View file

@ -1,8 +1,8 @@
use std::convert::TryFrom; use std::convert::TryFrom;
use err_derive::Error;
use hyper::header::HeaderValue; use hyper::header::HeaderValue;
use hyper::{HeaderMap, StatusCode}; use hyper::{HeaderMap, StatusCode};
use thiserror::Error;
pub use garage_model::helper::error::Error as HelperError; pub use garage_model::helper::error::Error as HelperError;
@ -16,17 +16,20 @@ use garage_api_common::helpers::*;
/// Errors of this crate /// Errors of this crate
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum Error { pub enum Error {
#[error("{0}")] #[error(display = "{}", _0)]
/// Error from common error /// Error from common error
Common(#[from] CommonError), Common(#[error(source)] CommonError),
// Category: cannot process // Category: cannot process
/// The API access key does not exist /// The API access key does not exist
#[error("Access key not found: {0}")] #[error(display = "Access key not found: {}", _0)]
NoSuchAccessKey(String), NoSuchAccessKey(String),
/// In Import key, the key already exists /// In Import key, the key already exists
#[error("Key {0} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.")] #[error(
display = "Key {} already exists in data store. Even if it is deleted, we can't let you create a new key with the same ID. Sorry.",
_0
)]
KeyAlreadyExists(String), KeyAlreadyExists(String),
} }

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_api_common" name = "garage_api_common"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -24,10 +24,11 @@ chrono.workspace = true
crc32fast.workspace = true crc32fast.workspace = true
crc32c.workspace = true crc32c.workspace = true
crypto-common.workspace = true crypto-common.workspace = true
thiserror.workspace = true err-derive.workspace = true
hex.workspace = true hex.workspace = true
hmac.workspace = true hmac.workspace = true
md-5.workspace = true md-5.workspace = true
idna.workspace = true
tracing.workspace = true tracing.workspace = true
nom.workspace = true nom.workspace = true
pin-project.workspace = true pin-project.workspace = true

View file

@ -1,7 +1,7 @@
use std::convert::TryFrom; use std::convert::TryFrom;
use err_derive::Error;
use hyper::StatusCode; use hyper::StatusCode;
use thiserror::Error;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
@ -12,48 +12,48 @@ use garage_model::helper::error::Error as HelperError;
pub enum CommonError { pub enum CommonError {
// ---- INTERNAL ERRORS ---- // ---- INTERNAL ERRORS ----
/// Error related to deeper parts of Garage /// Error related to deeper parts of Garage
#[error("Internal error: {0}")] #[error(display = "Internal error: {}", _0)]
InternalError(#[from] GarageError), InternalError(#[error(source)] GarageError),
/// Error related to Hyper /// Error related to Hyper
#[error("Internal error (Hyper error): {0}")] #[error(display = "Internal error (Hyper error): {}", _0)]
Hyper(#[from] hyper::Error), Hyper(#[error(source)] hyper::Error),
/// Error related to HTTP /// Error related to HTTP
#[error("Internal error (HTTP error): {0}")] #[error(display = "Internal error (HTTP error): {}", _0)]
Http(#[from] http::Error), Http(#[error(source)] http::Error),
// ---- GENERIC CLIENT ERRORS ---- // ---- GENERIC CLIENT ERRORS ----
/// Proper authentication was not provided /// Proper authentication was not provided
#[error("Forbidden: {0}")] #[error(display = "Forbidden: {}", _0)]
Forbidden(String), Forbidden(String),
/// Generic bad request response with custom message /// Generic bad request response with custom message
#[error("Bad request: {0}")] #[error(display = "Bad request: {}", _0)]
BadRequest(String), BadRequest(String),
/// The client sent a header with invalid value /// The client sent a header with invalid value
#[error("Invalid header value: {0}")] #[error(display = "Invalid header value: {}", _0)]
InvalidHeader(#[from] hyper::header::ToStrError), InvalidHeader(#[error(source)] hyper::header::ToStrError),
// ---- SPECIFIC ERROR CONDITIONS ---- // ---- SPECIFIC ERROR CONDITIONS ----
// These have to be error codes referenced in the S3 spec here: // These have to be error codes referenced in the S3 spec here:
// https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList // https://docs.aws.amazon.com/AmazonS3/latest/API/ErrorResponses.html#ErrorCodeList
/// The requested bucket does not exist /// The requested bucket does not exist
#[error("Bucket not found: {0}")] #[error(display = "Bucket not found: {}", _0)]
NoSuchBucket(String), NoSuchBucket(String),
/// Tried to create a bucket that already exist /// Tried to create a bucket that already exist
#[error("Bucket already exists")] #[error(display = "Bucket already exists")]
BucketAlreadyExists, BucketAlreadyExists,
/// Tried to delete a non-empty bucket /// Tried to delete a non-empty bucket
#[error("Tried to delete a non-empty bucket")] #[error(display = "Tried to delete a non-empty bucket")]
BucketNotEmpty, BucketNotEmpty,
// Category: bad request // Category: bad request
/// Bucket name is not valid according to AWS S3 specs /// Bucket name is not valid according to AWS S3 specs
#[error("Invalid bucket name: {0}")] #[error(display = "Invalid bucket name: {}", _0)]
InvalidBucketName(String), InvalidBucketName(String),
} }

View file

@ -58,12 +58,6 @@ pub trait ApiHandler: Send + Sync + 'static {
req: Request<IncomingBody>, req: Request<IncomingBody>,
endpoint: Self::Endpoint, endpoint: Self::Endpoint,
) -> impl Future<Output = Result<Response<BoxBody<Self::Error>>, Self::Error>> + Send; ) -> impl Future<Output = Result<Response<BoxBody<Self::Error>>, Self::Error>> + Send;
/// Returns the key id used to authenticate this request. The ID returned must be safe to
/// log.
fn key_id_from_request(&self, _req: &Request<IncomingBody>) -> Option<String> {
None
}
} }
pub struct ApiServer<A: ApiHandler> { pub struct ApiServer<A: ApiHandler> {
@ -148,20 +142,19 @@ impl<A: ApiHandler> ApiServer<A> {
) -> Result<Response<BoxBody<A::Error>>, http::Error> { ) -> Result<Response<BoxBody<A::Error>>, http::Error> {
let uri = req.uri().clone(); let uri = req.uri().clone();
let source = if let Ok(forwarded_for_ip_addr) = if let Ok(forwarded_for_ip_addr) =
forwarded_headers::handle_forwarded_for_headers(req.headers()) forwarded_headers::handle_forwarded_for_headers(req.headers())
{ {
format!("{forwarded_for_ip_addr} (via {addr})") info!(
"{} (via {}) {} {}",
forwarded_for_ip_addr,
addr,
req.method(),
uri
);
} else { } else {
format!("{addr}") info!("{} {} {}", addr, req.method(), uri);
}; }
// we only do this to log the access key, so we can discard any error
let key = self
.api_handler
.key_id_from_request(&req)
.map(|k| format!("(key {k}) "))
.unwrap_or_default();
info!("{source} {key}{} {uri}", req.method());
debug!("{:?}", req); debug!("{:?}", req);
let tracer = opentelemetry::global::tracer("garage"); let tracer = opentelemetry::global::tracer("garage");
@ -350,11 +343,7 @@ where
while !*must_exit.borrow() { while !*must_exit.borrow() {
let (stream, client_addr) = tokio::select! { let (stream, client_addr) = tokio::select! {
acc = listener.accept() => match acc { acc = listener.accept() => acc?,
Ok(r) => r,
Err(e) if e.kind() == std::io::ErrorKind::ConnectionAborted => continue,
Err(e) => return Err(e.into()),
},
_ = must_exit.changed() => continue, _ = must_exit.changed() => continue,
}; };

View file

@ -8,6 +8,7 @@ use hyper::{
body::{Body, Bytes}, body::{Body, Bytes},
Request, Response, Request, Response,
}; };
use idna::domain_to_unicode;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use garage_model::bucket_table::BucketParams; use garage_model::bucket_table::BucketParams;
@ -96,7 +97,7 @@ pub fn authority_to_host(authority: &str) -> Result<String, Error> {
authority authority
))), ))),
}; };
authority.map(|h| h.to_ascii_lowercase()) authority.map(|h| domain_to_unicode(h).0)
} }
/// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in /// Extract the bucket name and the key name from an HTTP path and possibly a bucket provided in

View file

@ -1,4 +1,4 @@
use thiserror::Error; use err_derive::Error;
use crate::common_error::CommonError; use crate::common_error::CommonError;
pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError}; pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInternalError};
@ -6,21 +6,21 @@ pub use crate::common_error::{CommonErrorDerivative, OkOrBadRequest, OkOrInterna
/// Errors of this crate /// Errors of this crate
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum Error { pub enum Error {
#[error("{0}")] #[error(display = "{}", _0)]
/// Error from common error /// Error from common error
Common(CommonError), Common(CommonError),
/// Authorization Header Malformed /// Authorization Header Malformed
#[error("Authorization header malformed, unexpected scope: {0}")] #[error(display = "Authorization header malformed, unexpected scope: {}", _0)]
AuthorizationHeaderMalformed(String), AuthorizationHeaderMalformed(String),
// Category: bad request // Category: bad request
/// The request contained an invalid UTF-8 sequence in its path or in other parameters /// The request contained an invalid UTF-8 sequence in its path or in other parameters
#[error("Invalid UTF-8: {0}")] #[error(display = "Invalid UTF-8: {}", _0)]
InvalidUtf8Str(#[from] std::str::Utf8Error), InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
/// The provided digest (checksum) value was invalid /// The provided digest (checksum) value was invalid
#[error("Invalid digest: {0}")] #[error(display = "Invalid digest: {}", _0)]
InvalidDigest(String), InvalidDigest(String),
} }

View file

@ -104,7 +104,7 @@ async fn check_standard_signature(
// Verify that all necessary request headers are included in signed_headers // Verify that all necessary request headers are included in signed_headers
// The following must be included for all signatures: // The following must be included for all signatures:
// - the Host header (mandatory) // - the Host header (mandatory)
// - all x-amz-* headers used in the request (except x-amz-content-sha256) // - all x-amz-* headers used in the request
// AWS also indicates that the Content-Type header should be signed if // AWS also indicates that the Content-Type header should be signed if
// it is used, but Minio client doesn't sign it so we don't check it for compatibility. // it is used, but Minio client doesn't sign it so we don't check it for compatibility.
let signed_headers = split_signed_headers(&authorization)?; let signed_headers = split_signed_headers(&authorization)?;
@ -151,7 +151,7 @@ async fn check_presigned_signature(
// Verify that all necessary request headers are included in signed_headers // Verify that all necessary request headers are included in signed_headers
// For AWSv4 pre-signed URLs, the following must be included: // For AWSv4 pre-signed URLs, the following must be included:
// - the Host header (mandatory) // - the Host header (mandatory)
// - all x-amz-* headers used in the request (except x-amz-content-sha256) // - all x-amz-* headers used in the request
let signed_headers = split_signed_headers(&authorization)?; let signed_headers = split_signed_headers(&authorization)?;
verify_signed_headers(request.headers(), &signed_headers)?; verify_signed_headers(request.headers(), &signed_headers)?;
@ -268,9 +268,7 @@ fn verify_signed_headers(headers: &HeaderMap, signed_headers: &[HeaderName]) ->
return Err(Error::bad_request("Header `Host` should be signed")); return Err(Error::bad_request("Header `Host` should be signed"));
} }
for (name, _) in headers.iter() { for (name, _) in headers.iter() {
// Enforce signature of all x-amz-* headers, except x-amz-content-sh256 if name.as_str().starts_with("x-amz-") {
// because it is included in the canonical request in all cases
if name.as_str().starts_with("x-amz-") && name != X_AMZ_CONTENT_SHA256 {
if !signed_headers.contains(name) { if !signed_headers.contains(name) {
return Err(Error::bad_request(format!( return Err(Error::bad_request(format!(
"Header `{}` should be signed", "Header `{}` should be signed",
@ -419,7 +417,7 @@ pub async fn verify_v4(
// ============ Authorization header, or X-Amz-* query params ========= // ============ Authorization header, or X-Amz-* query params =========
pub struct Authorization { pub struct Authorization {
pub key_id: String, key_id: String,
scope: String, scope: String,
signed_headers: String, signed_headers: String,
signature: String, signature: String,
@ -428,7 +426,7 @@ pub struct Authorization {
} }
impl Authorization { impl Authorization {
pub fn parse_header(headers: &HeaderMap) -> Result<Self, Error> { fn parse_header(headers: &HeaderMap) -> Result<Self, Error> {
let authorization = headers let authorization = headers
.get(AUTHORIZATION) .get(AUTHORIZATION)
.ok_or_bad_request("Missing authorization header")? .ok_or_bad_request("Missing authorization header")?
@ -470,7 +468,8 @@ impl Authorization {
let date = headers let date = headers
.get(X_AMZ_DATE) .get(X_AMZ_DATE)
.ok_or_bad_request("Missing X-Amz-Date field")? .ok_or_bad_request("Missing X-Amz-Date field")
.map_err(Error::from)?
.to_str()?; .to_str()?;
let date = parse_date(date)?; let date = parse_date(date)?;

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_api_k2v" name = "garage_api_k2v"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -20,7 +20,7 @@ garage_util = { workspace = true, features = [ "k2v" ] }
garage_api_common.workspace = true garage_api_common.workspace = true
base64.workspace = true base64.workspace = true
thiserror.workspace = true err-derive.workspace = true
tracing.workspace = true tracing.workspace = true
futures.workspace = true futures.workspace = true

View file

@ -176,12 +176,6 @@ impl ApiHandler for K2VApiServer {
Ok(resp_ok) Ok(resp_ok)
} }
fn key_id_from_request(&self, req: &Request<IncomingBody>) -> Option<String> {
garage_api_common::signature::payload::Authorization::parse_header(req.headers())
.map(|auth| auth.key_id)
.ok()
}
} }
impl ApiEndpoint for K2VApiEndpoint { impl ApiEndpoint for K2VApiEndpoint {

View file

@ -1,6 +1,6 @@
use err_derive::Error;
use hyper::header::HeaderValue; use hyper::header::HeaderValue;
use hyper::{HeaderMap, StatusCode}; use hyper::{HeaderMap, StatusCode};
use thiserror::Error;
use garage_api_common::common_error::{commonErrorDerivative, CommonError}; use garage_api_common::common_error::{commonErrorDerivative, CommonError};
pub(crate) use garage_api_common::common_error::{helper_error_as_internal, pass_helper_error}; pub(crate) use garage_api_common::common_error::{helper_error_as_internal, pass_helper_error};
@ -14,38 +14,38 @@ use garage_api_common::signature::error::Error as SignatureError;
/// Errors of this crate /// Errors of this crate
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum Error { pub enum Error {
#[error("{0}")] #[error(display = "{}", _0)]
/// Error from common error /// Error from common error
Common(#[from] CommonError), Common(#[error(source)] CommonError),
// Category: cannot process // Category: cannot process
/// Authorization Header Malformed /// Authorization Header Malformed
#[error("Authorization header malformed, unexpected scope: {0}")] #[error(display = "Authorization header malformed, unexpected scope: {}", _0)]
AuthorizationHeaderMalformed(String), AuthorizationHeaderMalformed(String),
/// The provided digest (checksum) value was invalid /// The provided digest (checksum) value was invalid
#[error("Invalid digest: {0}")] #[error(display = "Invalid digest: {}", _0)]
InvalidDigest(String), InvalidDigest(String),
/// The object requested don't exists /// The object requested don't exists
#[error("Key not found")] #[error(display = "Key not found")]
NoSuchKey, NoSuchKey,
/// Some base64 encoded data was badly encoded /// Some base64 encoded data was badly encoded
#[error("Invalid base64: {0}")] #[error(display = "Invalid base64: {}", _0)]
InvalidBase64(#[from] base64::DecodeError), InvalidBase64(#[error(source)] base64::DecodeError),
/// Invalid causality token /// Invalid causality token
#[error("Invalid causality token")] #[error(display = "Invalid causality token")]
InvalidCausalityToken, InvalidCausalityToken,
/// The client asked for an invalid return format (invalid Accept header) /// The client asked for an invalid return format (invalid Accept header)
#[error("Not acceptable: {0}")] #[error(display = "Not acceptable: {}", _0)]
NotAcceptable(String), NotAcceptable(String),
/// The request contained an invalid UTF-8 sequence in its path or in other parameters /// The request contained an invalid UTF-8 sequence in its path or in other parameters
#[error("Invalid UTF-8: {0}")] #[error(display = "Invalid UTF-8: {}", _0)]
InvalidUtf8Str(#[from] std::str::Utf8Error), InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
} }
commonErrorDerivative!(Error); commonErrorDerivative!(Error);

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_api_s3" name = "garage_api_s3"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -29,7 +29,7 @@ bytes.workspace = true
chrono.workspace = true chrono.workspace = true
crc32fast.workspace = true crc32fast.workspace = true
crc32c.workspace = true crc32c.workspace = true
thiserror.workspace = true err-derive.workspace = true
hex.workspace = true hex.workspace = true
tracing.workspace = true tracing.workspace = true
md-5.workspace = true md-5.workspace = true

View file

@ -226,7 +226,6 @@ impl ApiHandler for S3ApiServer {
Endpoint::DeleteBucket {} => handle_delete_bucket(ctx).await, Endpoint::DeleteBucket {} => handle_delete_bucket(ctx).await,
Endpoint::GetBucketLocation {} => handle_get_bucket_location(ctx), Endpoint::GetBucketLocation {} => handle_get_bucket_location(ctx),
Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(), Endpoint::GetBucketVersioning {} => handle_get_bucket_versioning(),
Endpoint::GetBucketAcl {} => handle_get_bucket_acl(ctx),
Endpoint::ListObjects { Endpoint::ListObjects {
delimiter, delimiter,
encoding_type, encoding_type,
@ -343,12 +342,6 @@ impl ApiHandler for S3ApiServer {
Ok(resp_ok) Ok(resp_ok)
} }
fn key_id_from_request(&self, req: &Request<IncomingBody>) -> Option<String> {
garage_api_common::signature::payload::Authorization::parse_header(req.headers())
.map(|auth| auth.key_id)
.ok()
}
} }
impl ApiEndpoint for S3ApiEndpoint { impl ApiEndpoint for S3ApiEndpoint {

View file

@ -5,7 +5,7 @@ use hyper::{Request, Response, StatusCode};
use garage_model::bucket_alias_table::*; use garage_model::bucket_alias_table::*;
use garage_model::bucket_table::Bucket; use garage_model::bucket_table::Bucket;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::key_table::{Key, KeyParams}; use garage_model::key_table::Key;
use garage_model::permission::BucketKeyPerm; use garage_model::permission::BucketKeyPerm;
use garage_table::util::*; use garage_table::util::*;
use garage_util::crdt::*; use garage_util::crdt::*;
@ -44,55 +44,6 @@ pub fn handle_get_bucket_versioning() -> Result<Response<ResBody>, Error> {
.body(string_body(xml))?) .body(string_body(xml))?)
} }
pub fn handle_get_bucket_acl(ctx: ReqCtx) -> Result<Response<ResBody>, Error> {
let ReqCtx {
bucket_id, api_key, ..
} = ctx;
let key_p = api_key.params().ok_or_internal_error(
"Key should not be in deleted state at this point (in handle_get_bucket_acl)",
)?;
let mut grants: Vec<s3_xml::Grant> = vec![];
let kp = api_key.bucket_permissions(&bucket_id);
if kp.allow_owner {
grants.push(s3_xml::Grant {
grantee: create_grantee(&key_p, &api_key),
permission: s3_xml::Value("FULL_CONTROL".to_string()),
});
} else {
if kp.allow_read {
grants.push(s3_xml::Grant {
grantee: create_grantee(&key_p, &api_key),
permission: s3_xml::Value("READ".to_string()),
});
grants.push(s3_xml::Grant {
grantee: create_grantee(&key_p, &api_key),
permission: s3_xml::Value("READ_ACP".to_string()),
});
}
if kp.allow_write {
grants.push(s3_xml::Grant {
grantee: create_grantee(&key_p, &api_key),
permission: s3_xml::Value("WRITE".to_string()),
});
}
}
let access_control_policy = s3_xml::AccessControlPolicy {
xmlns: (),
owner: None,
acl: s3_xml::AccessControlList { entries: grants },
};
let xml = s3_xml::to_xml_with_header(&access_control_policy)?;
trace!("xml: {}", xml);
Ok(Response::builder()
.header("Content-Type", "application/xml")
.body(string_body(xml))?)
}
pub async fn handle_list_buckets( pub async fn handle_list_buckets(
garage: &Garage, garage: &Garage,
api_key: &Key, api_key: &Key,
@ -221,7 +172,7 @@ pub async fn handle_create_bucket(
} }
// Create the bucket! // Create the bucket!
if !is_valid_bucket_name(&bucket_name, garage.config.allow_punycode) { if !is_valid_bucket_name(&bucket_name) {
return Err(Error::bad_request(format!( return Err(Error::bad_request(format!(
"{}: {}", "{}: {}",
bucket_name, INVALID_BUCKET_NAME_MESSAGE bucket_name, INVALID_BUCKET_NAME_MESSAGE
@ -290,11 +241,11 @@ pub async fn handle_delete_bucket(ctx: ReqCtx) -> Result<Response<ResBody>, Erro
// 1. delete bucket alias // 1. delete bucket alias
if is_local_alias { if is_local_alias {
helper helper
.purge_local_bucket_alias(*bucket_id, &api_key.key_id, bucket_name) .unset_local_bucket_alias(*bucket_id, &api_key.key_id, bucket_name)
.await?; .await?;
} else { } else {
helper helper
.purge_global_bucket_alias(*bucket_id, bucket_name) .unset_global_bucket_alias(*bucket_id, bucket_name)
.await?; .await?;
} }
@ -360,15 +311,6 @@ fn parse_create_bucket_xml(xml_bytes: &[u8]) -> Option<Option<String>> {
Some(ret) Some(ret)
} }
fn create_grantee(key_params: &KeyParams, api_key: &Key) -> s3_xml::Grantee {
s3_xml::Grantee {
xmlns_xsi: (),
typ: "CanonicalUser".to_string(),
display_name: Some(s3_xml::Value(key_params.name.get().to_string())),
id: Some(s3_xml::Value(api_key.key_id.to_string())),
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;

View file

@ -26,10 +26,9 @@ use garage_api_common::signature::checksum::*;
use crate::api_server::{ReqBody, ResBody}; use crate::api_server::{ReqBody, ResBody};
use crate::encryption::EncryptionParams; use crate::encryption::EncryptionParams;
use crate::error::*; use crate::error::*;
use crate::get::{check_version_not_deleted, full_object_byte_stream, PreconditionHeaders}; use crate::get::{full_object_byte_stream, PreconditionHeaders};
use crate::multipart; use crate::multipart;
use crate::put::{extract_metadata_headers, save_stream, ChecksumMode, SaveStreamResult}; use crate::put::{extract_metadata_headers, save_stream, ChecksumMode, SaveStreamResult};
use crate::website::X_AMZ_WEBSITE_REDIRECT_LOCATION;
use crate::xml::{self as s3_xml, xmlns_tag}; use crate::xml::{self as s3_xml, xmlns_tag};
pub const X_AMZ_COPY_SOURCE_IF_MATCH: HeaderName = pub const X_AMZ_COPY_SOURCE_IF_MATCH: HeaderName =
@ -85,18 +84,7 @@ pub async fn handle_copy(
Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => { Some(v) if v == hyper::header::HeaderValue::from_static("REPLACE") => {
extract_metadata_headers(req.headers())? extract_metadata_headers(req.headers())?
} }
_ => { _ => source_object_meta_inner.into_owned().headers,
// The x-amz-website-redirect-location header is not copied, instead
// it is replaced by the value from the request (or removed if no
// value was specified)
let is_redirect =
|(key, _): &(String, String)| key == X_AMZ_WEBSITE_REDIRECT_LOCATION.as_str();
let mut headers: Vec<_> = source_object_meta_inner.headers.clone();
headers.retain(|h| !is_redirect(h));
let new_headers = extract_metadata_headers(req.headers())?;
headers.extend(new_headers.into_iter().filter(is_redirect));
headers
}
}, },
checksum: source_checksum, checksum: source_checksum,
}; };
@ -237,7 +225,6 @@ async fn handle_copy_metaonly(
.get(&source_version.uuid, &EmptyKey) .get(&source_version.uuid, &EmptyKey)
.await?; .await?;
let source_version = source_version.ok_or(Error::NoSuchKey)?; let source_version = source_version.ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&source_version)?;
// Write an "uploading" marker in Object table // Write an "uploading" marker in Object table
// This holds a reference to the object in the Version table // This holds a reference to the object in the Version table
@ -429,7 +416,6 @@ pub async fn handle_upload_part_copy(
.get(&source_object_version.uuid, &EmptyKey) .get(&source_object_version.uuid, &EmptyKey)
.await? .await?
.ok_or(Error::NoSuchKey)?; .ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&source_version)?;
// We want to reuse blocks from the source version as much as possible. // We want to reuse blocks from the source version as much as possible.
// However, we still need to get the data from these blocks // However, we still need to get the data from these blocks
@ -561,7 +547,6 @@ pub async fn handle_upload_part_copy(
let mut current_offset = 0; let mut current_offset = 0;
let mut next_block = defragmenter.next().await?; let mut next_block = defragmenter.next().await?;
let mut blocks_to_dup = dest_version.clone();
// TODO this could be optimized similarly to read_and_put_blocks // TODO this could be optimized similarly to read_and_put_blocks
// low priority because uploadpartcopy is rarely used // low priority because uploadpartcopy is rarely used
@ -591,7 +576,8 @@ pub async fn handle_upload_part_copy(
.unwrap()?; .unwrap()?;
checksummer = checksummer_updated; checksummer = checksummer_updated;
let (version_block_key, version_block) = ( dest_version.blocks.clear();
dest_version.blocks.put(
VersionBlockKey { VersionBlockKey {
part_number, part_number,
offset: current_offset, offset: current_offset,
@ -603,56 +589,37 @@ pub async fn handle_upload_part_copy(
); );
current_offset += data_len; current_offset += data_len;
let next = if let Some(final_data) = data_to_upload { let block_ref = BlockRef {
dest_version.blocks.clear(); block: final_hash,
dest_version.blocks.put(version_block_key, version_block); version: dest_version_id,
let block_ref = BlockRef { deleted: false.into(),
block: final_hash,
version: dest_version_id,
deleted: false.into(),
};
let (_, _, _, next) = futures::try_join!(
// Thing 1: if the block is not exactly a block that existed before,
// we need to insert that data as a new block.
garage.block_manager.rpc_put_block(
final_hash,
final_data,
dest_encryption.is_encrypted(),
None
),
// Thing 2: we need to insert the block in the version
garage.version_table.insert(&dest_version),
// Thing 3: we need to add a block reference
garage.block_ref_table.insert(&block_ref),
// Thing 4: we need to read the next block
defragmenter.next(),
)?;
next
} else {
blocks_to_dup.blocks.put(version_block_key, version_block);
defragmenter.next().await?
}; };
let (_, _, _, next) = futures::try_join!(
// Thing 1: if the block is not exactly a block that existed before,
// we need to insert that data as a new block.
async {
if let Some(final_data) = data_to_upload {
garage
.block_manager
.rpc_put_block(final_hash, final_data, dest_encryption.is_encrypted(), None)
.await
} else {
Ok(())
}
},
// Thing 2: we need to insert the block in the version
garage.version_table.insert(&dest_version),
// Thing 3: we need to add a block reference
garage.block_ref_table.insert(&block_ref),
// Thing 4: we need to read the next block
defragmenter.next(),
)?;
next_block = next; next_block = next;
} }
assert_eq!(current_offset, source_range.length); assert_eq!(current_offset, source_range.length);
// Put the duplicated blocks into the version & block_refs tables
let block_refs_to_put = blocks_to_dup
.blocks
.items()
.iter()
.map(|b| BlockRef {
block: b.1.hash,
version: dest_version_id,
deleted: false.into(),
})
.collect::<Vec<_>>();
futures::try_join!(
garage.version_table.insert(&blocks_to_dup),
garage.block_ref_table.insert_many(&block_refs_to_put[..]),
)?;
let checksums = checksummer.finalize(); let checksums = checksummer.finalize();
let etag = dest_encryption.etag_from_md5(&checksums.md5); let etag = dest_encryption.etag_from_md5(&checksums.md5);
let checksum = checksums.extract(dest_object_checksum_algorithm); let checksum = checksums.extract(dest_object_checksum_algorithm);

View file

@ -88,9 +88,7 @@ pub async fn handle_put_cors(
pub struct CorsConfiguration { pub struct CorsConfiguration {
#[serde(serialize_with = "xmlns_tag", skip_deserializing)] #[serde(serialize_with = "xmlns_tag", skip_deserializing)]
pub xmlns: (), pub xmlns: (),
// "default" is required to be able to parse an empty list of rules, #[serde(rename = "CORSRule")]
// cf https://docs.rs/quick-xml/latest/quick_xml/de/#sequences-xsall-and-xssequence-xml-schema-types
#[serde(rename = "CORSRule", default)]
pub cors_rules: Vec<CorsRule>, pub cors_rules: Vec<CorsRule>,
} }
@ -272,26 +270,4 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_deserialize_norules() -> Result<(), Error> {
let message = r#"<?xml version="1.0" encoding="UTF-8"?>
<CORSConfiguration xmlns="http://s3.amazonaws.com/doc/2006-03-01/" />"#;
let conf: CorsConfiguration = from_str(message).unwrap();
let ref_value = CorsConfiguration {
xmlns: (),
cors_rules: vec![],
};
assert_eq! {
ref_value,
conf
};
let message2 = to_xml_with_header(&ref_value)?;
let cleanup = |c: &str| c.replace(char::is_whitespace, "");
assert_eq!(cleanup(message), cleanup(&message2));
Ok(())
}
} }

View file

@ -1,8 +1,8 @@
use std::convert::TryInto; use std::convert::TryInto;
use err_derive::Error;
use hyper::header::HeaderValue; use hyper::header::HeaderValue;
use hyper::{HeaderMap, StatusCode}; use hyper::{HeaderMap, StatusCode};
use thiserror::Error;
use garage_model::helper::error::Error as HelperError; use garage_model::helper::error::Error as HelperError;
@ -25,67 +25,67 @@ use crate::xml as s3_xml;
/// Errors of this crate /// Errors of this crate
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum Error { pub enum Error {
#[error("{0}")] #[error(display = "{}", _0)]
/// Error from common error /// Error from common error
Common(#[from] CommonError), Common(#[error(source)] CommonError),
// Category: cannot process // Category: cannot process
/// Authorization Header Malformed /// Authorization Header Malformed
#[error("Authorization header malformed, unexpected scope: {0}")] #[error(display = "Authorization header malformed, unexpected scope: {}", _0)]
AuthorizationHeaderMalformed(String), AuthorizationHeaderMalformed(String),
/// The object requested don't exists /// The object requested don't exists
#[error("Key not found")] #[error(display = "Key not found")]
NoSuchKey, NoSuchKey,
/// The multipart upload requested don't exists /// The multipart upload requested don't exists
#[error("Upload not found")] #[error(display = "Upload not found")]
NoSuchUpload, NoSuchUpload,
/// Precondition failed (e.g. x-amz-copy-source-if-match) /// Precondition failed (e.g. x-amz-copy-source-if-match)
#[error("At least one of the preconditions you specified did not hold")] #[error(display = "At least one of the preconditions you specified did not hold")]
PreconditionFailed, PreconditionFailed,
/// Parts specified in CMU request do not match parts actually uploaded /// Parts specified in CMU request do not match parts actually uploaded
#[error("Parts given to CompleteMultipartUpload do not match uploaded parts")] #[error(display = "Parts given to CompleteMultipartUpload do not match uploaded parts")]
InvalidPart, InvalidPart,
/// Parts given to CompleteMultipartUpload were not in ascending order /// Parts given to CompleteMultipartUpload were not in ascending order
#[error("Parts given to CompleteMultipartUpload were not in ascending order")] #[error(display = "Parts given to CompleteMultipartUpload were not in ascending order")]
InvalidPartOrder, InvalidPartOrder,
/// In CompleteMultipartUpload: not enough data /// In CompleteMultipartUpload: not enough data
/// (here we are more lenient than AWS S3) /// (here we are more lenient than AWS S3)
#[error("Proposed upload is smaller than the minimum allowed object size")] #[error(display = "Proposed upload is smaller than the minimum allowed object size")]
EntityTooSmall, EntityTooSmall,
// Category: bad request // Category: bad request
/// The request contained an invalid UTF-8 sequence in its path or in other parameters /// The request contained an invalid UTF-8 sequence in its path or in other parameters
#[error("Invalid UTF-8: {0}")] #[error(display = "Invalid UTF-8: {}", _0)]
InvalidUtf8Str(#[from] std::str::Utf8Error), InvalidUtf8Str(#[error(source)] std::str::Utf8Error),
/// The request used an invalid path /// The request used an invalid path
#[error("Invalid UTF-8: {0}")] #[error(display = "Invalid UTF-8: {}", _0)]
InvalidUtf8String(#[from] std::string::FromUtf8Error), InvalidUtf8String(#[error(source)] std::string::FromUtf8Error),
/// The client sent invalid XML data /// The client sent invalid XML data
#[error("Invalid XML: {0}")] #[error(display = "Invalid XML: {}", _0)]
InvalidXml(String), InvalidXml(String),
/// The client sent a range header with invalid value /// The client sent a range header with invalid value
#[error("Invalid HTTP range: {0:?}")] #[error(display = "Invalid HTTP range: {:?}", _0)]
InvalidRange((http_range::HttpRangeParseError, u64)), InvalidRange(#[error(from)] (http_range::HttpRangeParseError, u64)),
/// The client sent a range header with invalid value /// The client sent a range header with invalid value
#[error("Invalid encryption algorithm: {0:?}, should be AES256")] #[error(display = "Invalid encryption algorithm: {:?}, should be AES256", _0)]
InvalidEncryptionAlgorithm(String), InvalidEncryptionAlgorithm(String),
/// The provided digest (checksum) value was invalid /// The provided digest (checksum) value was invalid
#[error("Invalid digest: {0}")] #[error(display = "Invalid digest: {}", _0)]
InvalidDigest(String), InvalidDigest(String),
/// The client sent a request for an action not supported by garage /// The client sent a request for an action not supported by garage
#[error("Unimplemented action: {0}")] #[error(display = "Unimplemented action: {}", _0)]
NotImplemented(String), NotImplemented(String),
} }
@ -99,12 +99,6 @@ impl From<HelperError> for Error {
} }
} }
impl From<(http_range::HttpRangeParseError, u64)> for Error {
fn from(err: (http_range::HttpRangeParseError, u64)) -> Error {
Error::InvalidRange(err)
}
}
impl From<roxmltree::Error> for Error { impl From<roxmltree::Error> for Error {
fn from(err: roxmltree::Error) -> Self { fn from(err: roxmltree::Error) -> Self {
Self::InvalidXml(format!("{}", err)) Self::InvalidXml(format!("{}", err))

View file

@ -19,13 +19,12 @@ use garage_net::stream::ByteStream;
use garage_rpc::rpc_helper::OrderTag; use garage_rpc::rpc_helper::OrderTag;
use garage_table::EmptyKey; use garage_table::EmptyKey;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::{Error as UtilError, OkOrMessage}; use garage_util::error::OkOrMessage;
use garage_model::garage::Garage; use garage_model::garage::Garage;
use garage_model::s3::object_table::*; use garage_model::s3::object_table::*;
use garage_model::s3::version_table::*; use garage_model::s3::version_table::*;
use garage_api_common::common_error::CommonError;
use garage_api_common::helpers::*; use garage_api_common::helpers::*;
use garage_api_common::signature::checksum::{add_checksum_response_headers, X_AMZ_CHECKSUM_MODE}; use garage_api_common::signature::checksum::{add_checksum_response_headers, X_AMZ_CHECKSUM_MODE};
@ -216,7 +215,6 @@ pub async fn handle_head_without_ctx(
.get(&object_version.uuid, &EmptyKey) .get(&object_version.uuid, &EmptyKey)
.await? .await?
.ok_or(Error::NoSuchKey)?; .ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&version)?;
let (part_offset, part_end) = let (part_offset, part_end) =
calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?; calculate_part_bounds(&version, pn).ok_or(Error::InvalidPart)?;
@ -367,21 +365,6 @@ pub async fn handle_get_without_ctx(
} }
} }
pub(crate) fn check_version_not_deleted(version: &Version) -> Result<(), Error> {
if version.deleted.get() {
// the version was deleted between when the object_table was consulted
// and now, this could mean the object was deleted, or overriden.
// Rather than say the key doesn't exist, return a transient error
// to signal the client to try again.
return Err(CommonError::InternalError(UtilError::Message(
"conflict/inconsistency between object and version state, version is deleted"
.to_string(),
))
.into());
}
Ok(())
}
async fn handle_get_full( async fn handle_get_full(
garage: Arc<Garage>, garage: Arc<Garage>,
version: &ObjectVersion, version: &ObjectVersion,
@ -448,7 +431,6 @@ pub fn full_object_byte_stream(
.ok_or_message("channel closed")?; .ok_or_message("channel closed")?;
let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?; let version = version_fut.await.unwrap()?.ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&version)?;
for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) { for (i, (_, vb)) in version.blocks.items().iter().enumerate().skip(1) {
let stream_block_i = encryption let stream_block_i = encryption
.get_block(&garage, &vb.hash, Some(order_stream.order(i as u64))) .get_block(&garage, &vb.hash, Some(order_stream.order(i as u64)))
@ -464,14 +446,6 @@ pub fn full_object_byte_stream(
{ {
Ok(()) => (), Ok(()) => (),
Err(e) => { Err(e) => {
// TODO i think this is a bad idea, we should log
// an error and stop there. If the error happens to
// be exactly the size of what hasn't been streamed
// yet, the client will see the request as a
// success
// instead truncating the output notify the client
// something happened with their download, so that
// they can retry it
let _ = tx.send(error_stream_item(e)).await; let _ = tx.send(error_stream_item(e)).await;
} }
} }
@ -523,7 +497,7 @@ async fn handle_get_range(
.get(&version.uuid, &EmptyKey) .get(&version.uuid, &EmptyKey)
.await? .await?
.ok_or(Error::NoSuchKey)?; .ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&version)?;
let body = let body =
body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end); body_from_blocks_range(garage, encryption, version.blocks.items(), begin, end);
Ok(resp_builder.body(body)?) Ok(resp_builder.body(body)?)
@ -574,8 +548,6 @@ async fn handle_get_part(
.await? .await?
.ok_or(Error::NoSuchKey)?; .ok_or(Error::NoSuchKey)?;
check_version_not_deleted(&version)?;
let (begin, end) = let (begin, end) =
calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?; calculate_part_bounds(&version, part_number).ok_or(Error::InvalidPart)?;
@ -845,9 +817,7 @@ impl PreconditionHeaders {
} }
fn check(&self, v: &ObjectVersion, etag: &str) -> Result<Option<StatusCode>, Error> { fn check(&self, v: &ObjectVersion, etag: &str) -> Result<Option<StatusCode>, Error> {
// we store date with ms precision, but headers are precise to the second: truncate let v_date = UNIX_EPOCH + Duration::from_millis(v.timestamp);
// the timestamp to handle the same-second edge case
let v_date = UNIX_EPOCH + Duration::from_secs(v.timestamp / 1000);
// Implemented from https://datatracker.ietf.org/doc/html/rfc7232#section-6 // Implemented from https://datatracker.ietf.org/doc/html/rfc7232#section-6

View file

@ -27,7 +27,7 @@ pub async fn handle_get_lifecycle(ctx: ReqCtx) -> Result<Response<ResBody>, Erro
.body(string_body(xml))?) .body(string_body(xml))?)
} else { } else {
Ok(Response::builder() Ok(Response::builder()
.status(StatusCode::NOT_FOUND) .status(StatusCode::NO_CONTENT)
.body(empty_body())?) .body(empty_body())?)
} }
} }

View file

@ -141,26 +141,10 @@ pub async fn handle_post_object(
let mut conditions = decoded_policy.into_conditions()?; let mut conditions = decoded_policy.into_conditions()?;
// If there are conditions on the bucket name, check these against the actual bucket_name rather
// than the one in params, which is allowed to be absent.
if let Some(conds) = conditions.params.remove("bucket") {
for cond in conds {
let ok = match cond {
Operation::Equal(s) => s.as_str() == bucket_name,
Operation::StartsWith(s) => bucket_name.starts_with(&s),
};
if !ok {
return Err(Error::bad_request(
"Key 'bucket' has value not allowed in policy",
));
}
}
}
for (param_key, value) in params.iter() { for (param_key, value) in params.iter() {
let param_key = param_key.as_str(); let param_key = param_key.as_str();
match param_key { match param_key {
"policy" | "x-amz-signature" | "bucket" => (), // this is always accepted, as it's required to validate other fields "policy" | "x-amz-signature" => (), // this is always accepted, as it's required to validate other fields
"content-type" => { "content-type" => {
let conds = conditions.params.remove("content-type").ok_or_else(|| { let conds = conditions.params.remove("content-type").ok_or_else(|| {
Error::bad_request(format!("Key '{}' is not allowed in policy", param_key)) Error::bad_request(format!("Key '{}' is not allowed in policy", param_key))

View file

@ -39,6 +39,8 @@ use crate::encryption::EncryptionParams;
use crate::error::*; use crate::error::*;
use crate::website::X_AMZ_WEBSITE_REDIRECT_LOCATION; use crate::website::X_AMZ_WEBSITE_REDIRECT_LOCATION;
const PUT_BLOCKS_MAX_PARALLEL: usize = 3;
pub(crate) struct SaveStreamResult { pub(crate) struct SaveStreamResult {
pub(crate) version_uuid: Uuid, pub(crate) version_uuid: Uuid,
pub(crate) version_timestamp: u64, pub(crate) version_timestamp: u64,
@ -491,7 +493,7 @@ pub(crate) async fn read_and_put_blocks<S: Stream<Item = Result<Bytes, Error>> +
}; };
let recv_next = async { let recv_next = async {
// If more than a maximum number of writes are in progress, don't add more for now // If more than a maximum number of writes are in progress, don't add more for now
if currently_running >= ctx.garage.config.block_max_concurrent_writes_per_request { if currently_running >= PUT_BLOCKS_MAX_PARALLEL {
futures::future::pending().await futures::future::pending().await
} else { } else {
block_rx3.recv().await block_rx3.recv().await

View file

@ -13,10 +13,6 @@ pub fn xmlns_tag<S: Serializer>(_v: &(), s: S) -> Result<S::Ok, S::Error> {
s.serialize_str("http://s3.amazonaws.com/doc/2006-03-01/") s.serialize_str("http://s3.amazonaws.com/doc/2006-03-01/")
} }
pub fn xmlns_xsi_tag<S: Serializer>(_v: &(), s: S) -> Result<S::Ok, S::Error> {
s.serialize_str("http://www.w3.org/2001/XMLSchema-instance")
}
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)]
pub struct Value(#[serde(rename = "$value")] pub String); pub struct Value(#[serde(rename = "$value")] pub String);
@ -323,42 +319,6 @@ pub struct PostObject {
pub etag: Value, pub etag: Value,
} }
#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Grantee {
#[serde(rename = "xmlns:xsi", serialize_with = "xmlns_xsi_tag")]
pub xmlns_xsi: (),
#[serde(rename = "xsi:type")]
pub typ: String,
#[serde(rename = "DisplayName")]
pub display_name: Option<Value>,
#[serde(rename = "ID")]
pub id: Option<Value>,
}
#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct Grant {
#[serde(rename = "Grantee")]
pub grantee: Grantee,
#[serde(rename = "Permission")]
pub permission: Value,
}
#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct AccessControlList {
#[serde(rename = "Grant")]
pub entries: Vec<Grant>,
}
#[derive(Debug, Serialize, PartialEq, Eq)]
pub struct AccessControlPolicy {
#[serde(serialize_with = "xmlns_tag")]
pub xmlns: (),
#[serde(rename = "Owner")]
pub owner: Option<Owner>,
#[serde(rename = "AccessControlList")]
pub acl: AccessControlList,
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -467,43 +427,6 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn get_bucket_acl_result() -> Result<(), ApiError> {
let grant = Grant {
grantee: Grantee {
xmlns_xsi: (),
typ: "CanonicalUser".to_string(),
display_name: Some(Value("owner_name".to_string())),
id: Some(Value("qsdfjklm".to_string())),
},
permission: Value("FULL_CONTROL".to_string()),
};
let get_bucket_acl = AccessControlPolicy {
xmlns: (),
owner: None,
acl: AccessControlList {
entries: vec![grant],
},
};
assert_eq!(
to_xml_with_header(&get_bucket_acl)?,
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\
<AccessControlPolicy xmlns=\"http://s3.amazonaws.com/doc/2006-03-01/\">\
<AccessControlList>\
<Grant>\
<Grantee xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:type=\"CanonicalUser\">\
<DisplayName>owner_name</DisplayName>\
<ID>qsdfjklm</ID>\
</Grantee>\
<Permission>FULL_CONTROL</Permission>\
</Grant>\
</AccessControlList>\
</AccessControlPolicy>"
);
Ok(())
}
#[test] #[test]
fn delete_result() -> Result<(), ApiError> { fn delete_result() -> Result<(), ApiError> {
let delete_result = DeleteResult { let delete_result = DeleteResult {

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_block" name = "garage_block"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -17,6 +17,7 @@ use opentelemetry::{
Context, Context,
}; };
use garage_net::endpoint::RpcInFlightLimiter;
use garage_net::stream::{read_stream_to_end, stream_asyncread, ByteStream}; use garage_net::stream::{read_stream_to_end, stream_asyncread, ByteStream};
use garage_db as db; use garage_db as db;
@ -50,8 +51,6 @@ pub const INLINE_THRESHOLD: usize = 3072;
// to delete the block locally. // to delete the block locally.
pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600); pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
const BLOCK_READ_SEMAPHORE_TIMEOUT: Duration = Duration::from_secs(15);
/// RPC messages used to share blocks of data between nodes /// RPC messages used to share blocks of data between nodes
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
pub enum BlockRpc { pub enum BlockRpc {
@ -89,7 +88,6 @@ pub struct BlockManager {
disable_scrub: bool, disable_scrub: bool,
mutation_lock: Vec<Mutex<BlockManagerLocked>>, mutation_lock: Vec<Mutex<BlockManagerLocked>>,
read_semaphore: Semaphore,
pub rc: BlockRc, pub rc: BlockRc,
pub resync: BlockResyncManager, pub resync: BlockResyncManager,
@ -179,8 +177,6 @@ impl BlockManager {
.iter() .iter()
.map(|_| Mutex::new(BlockManagerLocked())) .map(|_| Mutex::new(BlockManagerLocked()))
.collect::<Vec<_>>(), .collect::<Vec<_>>(),
read_semaphore: Semaphore::new(config.block_max_concurrent_reads),
rc, rc,
resync, resync,
system, system,
@ -300,6 +296,7 @@ impl BlockManager {
&node_id, &node_id,
BlockRpc::GetBlock(*hash, order_tag), BlockRpc::GetBlock(*hash, order_tag),
priority, priority,
RpcInFlightLimiter::TableWrite,
); );
tokio::select! { tokio::select! {
res = rpc => { res = rpc => {
@ -413,8 +410,8 @@ impl BlockManager {
} }
/// Get number of items in the refcount table /// Get number of items in the refcount table
pub fn rc_approximate_len(&self) -> Result<usize, Error> { pub fn rc_len(&self) -> Result<usize, Error> {
Ok(self.rc.rc_table.approximate_len()?) Ok(self.rc.rc_table.len()?)
} }
/// Send command to start/stop/manager scrub worker /// Send command to start/stop/manager scrub worker
@ -432,7 +429,7 @@ impl BlockManager {
/// List all resync errors /// List all resync errors
pub fn list_resync_errors(&self) -> Result<Vec<BlockResyncErrorInfo>, Error> { pub fn list_resync_errors(&self) -> Result<Vec<BlockResyncErrorInfo>, Error> {
let mut blocks = Vec::with_capacity(self.resync.errors.approximate_len()?); let mut blocks = Vec::with_capacity(self.resync.errors.len()?);
for ent in self.resync.errors.iter()? { for ent in self.resync.errors.iter()? {
let (hash, cnt) = ent?; let (hash, cnt) = ent?;
let cnt = ErrorCounter::decode(&cnt); let cnt = ErrorCounter::decode(&cnt);
@ -562,6 +559,9 @@ impl BlockManager {
match self.find_block(hash).await { match self.find_block(hash).await {
Some(p) => self.read_block_from(hash, &p).await, Some(p) => self.read_block_from(hash, &p).await,
None => { None => {
// Not found but maybe we should have had it ??
self.resync
.put_to_resync(hash, 2 * self.system.rpc_helper().rpc_timeout())?;
return Err(Error::Message(format!( return Err(Error::Message(format!(
"block {:?} not found on node", "block {:?} not found on node",
hash hash
@ -583,15 +583,6 @@ impl BlockManager {
) -> Result<DataBlock, Error> { ) -> Result<DataBlock, Error> {
let (header, path) = block_path.as_parts_ref(); let (header, path) = block_path.as_parts_ref();
let permit = tokio::select! {
sem = self.read_semaphore.acquire() => sem.ok_or_message("acquire read semaphore")?,
_ = tokio::time::sleep(BLOCK_READ_SEMAPHORE_TIMEOUT) => {
self.metrics.block_read_semaphore_timeouts.add(1);
debug!("read block {:?}: read_semaphore acquire timeout", hash);
return Err(Error::Message("read block: read_semaphore acquire timeout".into()));
}
};
let mut f = fs::File::open(&path).await?; let mut f = fs::File::open(&path).await?;
let mut data = vec![]; let mut data = vec![];
f.read_to_end(&mut data).await?; f.read_to_end(&mut data).await?;
@ -616,8 +607,6 @@ impl BlockManager {
return Err(Error::CorruptData(*hash)); return Err(Error::CorruptData(*hash));
} }
drop(permit);
Ok(data) Ok(data)
} }
@ -783,7 +772,6 @@ impl BlockManagerLocked {
let mut f = fs::File::create(&path_tmp).await?; let mut f = fs::File::create(&path_tmp).await?;
f.write_all(data).await?; f.write_all(data).await?;
f.flush().await?;
mgr.metrics.bytes_written.add(data.len() as u64); mgr.metrics.bytes_written.add(data.len() as u64);
if mgr.data_fsync { if mgr.data_fsync {

View file

@ -22,7 +22,6 @@ pub struct BlockManagerMetrics {
pub(crate) bytes_read: BoundCounter<u64>, pub(crate) bytes_read: BoundCounter<u64>,
pub(crate) block_read_duration: BoundValueRecorder<f64>, pub(crate) block_read_duration: BoundValueRecorder<f64>,
pub(crate) block_read_semaphore_timeouts: BoundCounter<u64>,
pub(crate) bytes_written: BoundCounter<u64>, pub(crate) bytes_written: BoundCounter<u64>,
pub(crate) block_write_duration: BoundValueRecorder<f64>, pub(crate) block_write_duration: BoundValueRecorder<f64>,
pub(crate) delete_counter: BoundCounter<u64>, pub(crate) delete_counter: BoundCounter<u64>,
@ -51,7 +50,7 @@ impl BlockManagerMetrics {
.init(), .init(),
_rc_size: meter _rc_size: meter
.u64_value_observer("block.rc_size", move |observer| { .u64_value_observer("block.rc_size", move |observer| {
if let Ok(value) = rc_tree.approximate_len() { if let Ok(value) = rc_tree.len() {
observer.observe(value as u64, &[]) observer.observe(value as u64, &[])
} }
}) })
@ -59,7 +58,7 @@ impl BlockManagerMetrics {
.init(), .init(),
_resync_queue_len: meter _resync_queue_len: meter
.u64_value_observer("block.resync_queue_length", move |observer| { .u64_value_observer("block.resync_queue_length", move |observer| {
if let Ok(value) = resync_queue.approximate_len() { if let Ok(value) = resync_queue.len() {
observer.observe(value as u64, &[]); observer.observe(value as u64, &[]);
} }
}) })
@ -69,7 +68,7 @@ impl BlockManagerMetrics {
.init(), .init(),
_resync_errored_blocks: meter _resync_errored_blocks: meter
.u64_value_observer("block.resync_errored_blocks", move |observer| { .u64_value_observer("block.resync_errored_blocks", move |observer| {
if let Ok(value) = resync_errors.approximate_len() { if let Ok(value) = resync_errors.len() {
observer.observe(value as u64, &[]); observer.observe(value as u64, &[]);
} }
}) })
@ -120,11 +119,6 @@ impl BlockManagerMetrics {
.with_description("Duration of block read operations") .with_description("Duration of block read operations")
.init() .init()
.bind(&[]), .bind(&[]),
block_read_semaphore_timeouts: meter
.u64_counter("block.read_semaphore_timeouts")
.with_description("Number of block reads that failed due to semaphore acquire timeout")
.init()
.bind(&[]),
bytes_written: meter bytes_written: meter
.u64_counter("block.bytes_written") .u64_counter("block.bytes_written")
.with_description("Number of bytes written to disk") .with_description("Number of bytes written to disk")

View file

@ -106,13 +106,13 @@ impl BlockResyncManager {
} }
/// Get length of resync queue /// Get length of resync queue
pub fn queue_approximate_len(&self) -> Result<usize, Error> { pub fn queue_len(&self) -> Result<usize, Error> {
Ok(self.queue.approximate_len()?) Ok(self.queue.len()?)
} }
/// Get number of blocks that have an error /// Get number of blocks that have an error
pub fn errors_approximate_len(&self) -> Result<usize, Error> { pub fn errors_len(&self) -> Result<usize, Error> {
Ok(self.errors.approximate_len()?) Ok(self.errors.len()?)
} }
/// Clear the error counter for a block and put it in queue immediately /// Clear the error counter for a block and put it in queue immediately
@ -133,14 +133,6 @@ impl BlockResyncManager {
))) )))
} }
/// Clear the entire resync queue and the list of errored blocks.
///
/// Corresponds to `garage repair clear-resync-queue`.
///
/// The queue is cleared first, then the error list; the first failing
/// `clear()` is returned and the remaining step is not attempted.
pub fn clear_resync_queue(&self) -> Result<(), Error> {
self.queue.clear()?;
self.errors.clear()?;
Ok(())
}
pub fn register_bg_vars(&self, vars: &mut vars::BgVars) { pub fn register_bg_vars(&self, vars: &mut vars::BgVars) {
let notify = self.notify.clone(); let notify = self.notify.clone();
vars.register_rw( vars.register_rw(
@ -556,11 +548,9 @@ impl Worker for ResyncWorker {
} }
WorkerStatus { WorkerStatus {
queue_length: Some(self.manager.resync.queue_approximate_len().unwrap_or(0) as u64), queue_length: Some(self.manager.resync.queue_len().unwrap_or(0) as u64),
tranquility: Some(tranquility), tranquility: Some(tranquility),
persistent_errors: Some( persistent_errors: Some(self.manager.resync.errors_len().unwrap_or(0) as u64),
self.manager.resync.errors_approximate_len().unwrap_or(0) as u64
),
..Default::default() ..Default::default()
} }
} }

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_db" name = "garage_db"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -12,18 +12,14 @@ readme = "../../README.md"
path = "lib.rs" path = "lib.rs"
[dependencies] [dependencies]
thiserror.workspace = true err-derive.workspace = true
tracing.workspace = true tracing.workspace = true
heed = { workspace = true, optional = true } heed = { workspace = true, optional = true }
rusqlite = { workspace = true, optional = true, features = ["backup"] } rusqlite = { workspace = true, optional = true, features = ["backup"] }
r2d2 = { workspace = true, optional = true } r2d2 = { workspace = true, optional = true }
r2d2_sqlite = { workspace = true, optional = true } r2d2_sqlite = { workspace = true, optional = true }
fjall = { workspace = true, optional = true }
parking_lot = { workspace = true, optional = true }
[dev-dependencies] [dev-dependencies]
mktemp.workspace = true mktemp.workspace = true
@ -31,5 +27,4 @@ mktemp.workspace = true
default = [ "lmdb", "sqlite" ] default = [ "lmdb", "sqlite" ]
bundled-libs = [ "rusqlite?/bundled" ] bundled-libs = [ "rusqlite?/bundled" ]
lmdb = [ "heed" ] lmdb = [ "heed" ]
fjall = [ "dep:fjall", "dep:parking_lot" ]
sqlite = [ "rusqlite", "r2d2", "r2d2_sqlite" ] sqlite = [ "rusqlite", "r2d2", "r2d2_sqlite" ]

View file

@ -1,453 +0,0 @@
use core::ops::Bound;
use std::path::PathBuf;
use std::sync::Arc;
use parking_lot::{MappedRwLockReadGuard, RwLock, RwLockReadGuard};
use fjall::{
PartitionCreateOptions, PersistMode, TransactionalKeyspace, TransactionalPartitionHandle,
WriteTransaction,
};
use crate::{
open::{Engine, OpenOpt},
Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult,
TxResult, TxValueIter, Value, ValueIter,
};
pub use fjall;
// --
pub(crate) fn open_db(path: &PathBuf, opt: &OpenOpt) -> Result<Db> {
info!("Opening Fjall database at: {}", path.display());
if opt.fsync {
return Err(Error(
"metadata_fsync is not supported with the Fjall database engine".into(),
));
}
let mut config = fjall::Config::new(path);
if let Some(block_cache_size) = opt.fjall_block_cache_size {
config = config.cache_size(block_cache_size as u64);
}
let keyspace = config.open_transactional()?;
Ok(FjallDb::init(keyspace))
}
// -- err
impl From<fjall::Error> for Error {
fn from(e: fjall::Error) -> Error {
Error(format!("fjall: {}", e).into())
}
}
impl From<fjall::LsmError> for Error {
fn from(e: fjall::LsmError) -> Error {
Error(format!("fjall lsm_tree: {}", e).into())
}
}
impl From<fjall::Error> for TxOpError {
fn from(e: fjall::Error) -> TxOpError {
TxOpError(e.into())
}
}
// -- db
/// Database backend built on a Fjall transactional keyspace.
pub struct FjallDb {
/// Keyspace in which every tree is stored as one partition.
keyspace: TransactionalKeyspace,
/// Partitions opened so far, as `(encoded name, handle)` pairs.
/// The position of an entry in this list is the tree id handed out by `open_tree`.
trees: RwLock<Vec<(String, TransactionalPartitionHandle)>>,
}
/// `(low, high)` pair of bounds over borrowed byte slices, used as the range
/// type argument of the non-transactional range scans.
type ByteRefRangeBound<'r> = (Bound<&'r [u8]>, Bound<&'r [u8]>);
impl FjallDb {
pub fn init(keyspace: TransactionalKeyspace) -> Db {
let s = Self {
keyspace,
trees: RwLock::new(Vec::new()),
};
Db(Arc::new(s))
}
fn get_tree(
&self,
i: usize,
) -> Result<MappedRwLockReadGuard<'_, TransactionalPartitionHandle>> {
RwLockReadGuard::try_map(self.trees.read(), |trees: &Vec<_>| {
trees.get(i).map(|tup| &tup.1)
})
.map_err(|_| Error("invalid tree id".into()))
}
}
/// `IDb` implementation on top of Fjall: every tree is one partition of the
/// transactional keyspace, addressed by its index in `self.trees`.
impl IDb for FjallDb {
/// Human-readable engine name.
fn engine(&self) -> String {
"Fjall (EXPERIMENTAL!)".into()
}
/// Opens the tree called `name` and returns its id.
///
/// The name is first passed through `encode_name`. If a partition with the
/// same encoded name is already registered, its existing index is returned;
/// otherwise the partition is opened and appended to the list.
fn open_tree(&self, name: &str) -> Result<usize> {
// The write lock is held for the whole lookup-or-insert sequence.
let mut trees = self.trees.write();
let safe_name = encode_name(name)?;
if let Some(i) = trees.iter().position(|(name, _)| *name == safe_name) {
Ok(i)
} else {
let tree = self
.keyspace
.open_partition(&safe_name, PartitionCreateOptions::default())?;
let i = trees.len();
trees.push((safe_name, tree));
Ok(i)
}
}
/// Lists the partitions of the keyspace, with names passed through `decode_name`.
/// Fails on the first partition name that cannot be decoded.
fn list_trees(&self) -> Result<Vec<String>> {
Ok(self
.keyspace
.list_partitions()
.iter()
.map(|n| decode_name(&n))
.collect::<Result<Vec<_>>>()?)
}
/// Copies every partition into a new keyspace created under `base_path`
/// (at the path given by `Engine::Fjall.db_path`).
fn snapshot(&self, base_path: &PathBuf) -> Result<()> {
std::fs::create_dir_all(base_path)?;
let path = Engine::Fjall.db_path(base_path);
// A single read transaction is used to iterate over all source partitions.
let source_state = self.keyspace.read_tx();
let copy_keyspace = fjall::Config::new(path).open()?;
for partition_name in self.keyspace.list_partitions() {
let source_partition = self
.keyspace
.open_partition(&partition_name, PartitionCreateOptions::default())?;
let copy_partition =
copy_keyspace.open_partition(&partition_name, PartitionCreateOptions::default())?;
for entry in source_state.iter(&source_partition) {
let (key, value) = entry?;
copy_partition.insert(key, value)?;
}
}
// Explicitly persist the copy before reporting success.
copy_keyspace.persist(PersistMode::SyncAll)?;
Ok(())
}
// ----
/// Reads `key` from the given tree through a fresh read transaction,
/// returning an owned copy of the value if present.
fn get(&self, tree_idx: usize, key: &[u8]) -> Result<Option<Value>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
let val = tx.get(&tree, key)?;
match val {
None => Ok(None),
Some(v) => Ok(Some(v.to_vec())),
}
}
/// Returns the value reported by the partition's own `approximate_len()`.
fn approximate_len(&self, tree_idx: usize) -> Result<usize> {
let tree = self.get_tree(tree_idx)?;
Ok(tree.approximate_len())
}
/// Tells whether the tree is empty, as seen by a fresh read transaction.
fn is_empty(&self, tree_idx: usize) -> Result<bool> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(tx.is_empty(&tree)?)
}
/// Inserts `key` -> `value` in its own write transaction, committed before returning.
fn insert(&self, tree_idx: usize, key: &[u8], value: &[u8]) -> Result<()> {
let tree = self.get_tree(tree_idx)?;
let mut tx = self.keyspace.write_tx();
tx.insert(&tree, key, value);
tx.commit()?;
Ok(())
}
/// Removes `key` in its own write transaction, committed before returning.
fn remove(&self, tree_idx: usize, key: &[u8]) -> Result<()> {
let tree = self.get_tree(tree_idx)?;
let mut tx = self.keyspace.write_tx();
tx.remove(&tree, key);
tx.commit()?;
Ok(())
}
/// Empties a tree by deleting its partition and opening a new one under
/// the same name, stored back at the same index.
fn clear(&self, tree_idx: usize) -> Result<()> {
let mut trees = self.trees.write();
if tree_idx >= trees.len() {
return Err(Error("invalid tree id".into()));
}
// NOTE(review): the entry is taken out of the list before the two fallible
// calls below. If either returns an error, the entry is not put back and
// the indices of all following trees shift by one — confirm whether this
// needs handling.
let (name, tree) = trees.remove(tree_idx);
self.keyspace.delete_partition(tree)?;
let tree = self
.keyspace
.open_partition(&name, PartitionCreateOptions::default())?;
trees.insert(tree_idx, (name, tree));
Ok(())
}
/// Iterates over all entries of the tree through a fresh read transaction.
fn iter(&self, tree_idx: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(tx.iter(&tree).map(iterator_remap)))
}
/// Same as `iter`, with the iterator reversed.
fn iter_rev(&self, tree_idx: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(tx.iter(&tree).rev().map(iterator_remap)))
}
/// Iterates over the entries whose keys fall within `(low, high)`.
fn range<'r>(
&self,
tree_idx: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
.map(iterator_remap),
))
}
/// Same as `range`, with the iterator reversed.
fn range_rev<'r>(
&self,
tree_idx: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let tx = self.keyspace.read_tx();
Ok(Box::new(
tx.range::<&'r [u8], ByteRefRangeBound>(&tree, (low, high))
.rev()
.map(iterator_remap),
))
}
// ----
/// Runs `f` inside one write transaction spanning all opened trees.
///
/// The transaction is committed when `f` reports `Ok`, and rolled back
/// when it reports `Abort` or `DbErr`.
fn transaction(&self, f: &dyn ITxFn) -> TxResult<OnCommit, ()> {
// The read lock on the tree list is held for the whole transaction:
// `FjallTx` borrows the slice of handles from this guard.
let trees = self.trees.read();
let mut tx = FjallTx {
trees: &trees[..],
tx: self.keyspace.write_tx(),
};
let res = f.try_on(&mut tx);
match res {
TxFnResult::Ok(on_commit) => {
tx.tx.commit().map_err(Error::from).map_err(TxError::Db)?;
Ok(on_commit)
}
TxFnResult::Abort => {
tx.tx.rollback();
Err(TxError::Abort(()))
}
TxFnResult::DbErr => {
tx.tx.rollback();
// Placeholder error; per its own text it is expected to be
// discarded by the caller.
Err(TxError::Db(Error(
"(this message will be discarded)".into(),
)))
}
}
}
}
// ----
/// State of one read-write transaction, handed to user code through `ITx`.
struct FjallTx<'a> {
/// Trees that were opened when the transaction started, as
/// `(encoded name, handle)` pairs indexed by tree id.
trees: &'a [(String, TransactionalPartitionHandle)],
/// The underlying Fjall write transaction.
tx: WriteTransaction<'a>,
}
impl<'a> FjallTx<'a> {
fn get_tree(&self, i: usize) -> TxOpResult<&TransactionalPartitionHandle> {
self.trees.get(i).map(|tup| &tup.1).ok_or_else(|| {
TxOpError(Error(
"invalid tree id (it might have been openned after the transaction started)".into(),
))
})
}
}
impl<'a> ITx for FjallTx<'a> {
fn get(&self, tree_idx: usize, key: &[u8]) -> TxOpResult<Option<Value>> {
let tree = self.get_tree(tree_idx)?;
match self.tx.get(tree, key)? {
Some(v) => Ok(Some(v.to_vec())),
None => Ok(None),
}
}
fn len(&self, tree_idx: usize) -> TxOpResult<usize> {
let tree = self.get_tree(tree_idx)?;
Ok(self.tx.len(tree)? as usize)
}
fn insert(&mut self, tree_idx: usize, key: &[u8], value: &[u8]) -> TxOpResult<()> {
let tree = self.get_tree(tree_idx)?.clone();
self.tx.insert(&tree, key, value);
Ok(())
}
fn remove(&mut self, tree_idx: usize, key: &[u8]) -> TxOpResult<()> {
let tree = self.get_tree(tree_idx)?.clone();
self.tx.remove(&tree, key);
Ok(())
}
fn clear(&mut self, _tree_idx: usize) -> TxOpResult<()> {
unimplemented!("LSM tree clearing in cross-partition transaction is not supported")
}
fn iter(&self, tree_idx: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree_idx)?.clone();
Ok(Box::new(self.tx.iter(&tree).map(iterator_remap_tx)))
}
fn iter_rev(&self, tree_idx: usize) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree_idx)?.clone();
Ok(Box::new(self.tx.iter(&tree).rev().map(iterator_remap_tx)))
}
fn range<'r>(
&self,
tree_idx: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let low = clone_bound(low);
let high = clone_bound(high);
Ok(Box::new(
self.tx
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
.map(iterator_remap_tx),
))
}
fn range_rev<'r>(
&self,
tree_idx: usize,
low: Bound<&'r [u8]>,
high: Bound<&'r [u8]>,
) -> TxOpResult<TxValueIter<'_>> {
let tree = self.get_tree(tree_idx)?;
let low = clone_bound(low);
let high = clone_bound(high);
Ok(Box::new(
self.tx
.range::<Vec<u8>, ByteVecRangeBounds>(&tree, (low, high))
.rev()
.map(iterator_remap_tx),
))
}
}
// -- maps fjall's (k, v) to ours
fn iterator_remap(r: fjall::Result<(fjall::Slice, fjall::Slice)>) -> Result<(Value, Value)> {
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
.map_err(|e| e.into())
}
fn iterator_remap_tx(r: fjall::Result<(fjall::Slice, fjall::Slice)>) -> TxOpResult<(Value, Value)> {
r.map(|(k, v)| (k.to_vec(), v.to_vec()))
.map_err(|e| e.into())
}
// -- utils to deal with Garage's tightness on Bound lifetimes
/// A range bound that owns its key bytes.
type ByteVecBound = Bound<Vec<u8>>;
/// `(low, high)` pair of owned bounds, used as the range type argument of
/// the transactional range scans.
type ByteVecRangeBounds = (ByteVecBound, ByteVecBound);

/// Converts a bound over a borrowed byte slice into the same kind of bound
/// over an owned copy of those bytes.
fn clone_bound(bound: Bound<&[u8]>) -> ByteVecBound {
    match bound {
        Bound::Included(bytes) => Bound::Included(bytes.to_vec()),
        Bound::Excluded(bytes) => Bound::Excluded(bytes.to_vec()),
        Bound::Unbounded => Bound::Unbounded,
    }
}
// -- utils to encode table names --
/// Encodes a table name into a restricted ASCII alphabet.
///
/// ASCII letters and digits, `_`, `-` and `#` are copied verbatim. Any other
/// character with a code point of at most 255 is written as `$` followed by
/// two letters in `A..=P`, one per 4-bit half of the code point (high half
/// first). The result therefore only contains ASCII characters.
///
/// # Errors
///
/// Returns an error if the name contains a character with a code point
/// above 255, since it cannot be represented by this scheme.
fn encode_name(s: &str) -> Result<String> {
    let mut ret = String::with_capacity(s.len() + 10);
    for c in s.chars() {
        // Only *ASCII* alphanumerics pass through unchanged: a Unicode-aware
        // check would let characters such as 'é' leak into the encoded name.
        if c.is_ascii_alphanumeric() || c == '_' || c == '-' || c == '#' {
            ret.push(c);
        } else if c <= u8::MAX as char {
            // Lossless: the branch condition guarantees the code point fits in a byte.
            let byte = c as u8;
            ret.push('$');
            ret.push(char::from(b'A' + (byte >> 4)));
            ret.push(char::from(b'A' + (byte & 0x0f)));
        } else {
            return Err(Error(
                format!("table name {} could not be safely encoded", s).into(),
            ));
        }
    }
    Ok(ret)
}
fn decode_name(s: &str) -> Result<String> {
use std::convert::TryFrom;
let errfn = || Error(format!("encoded table name {} is invalid", s).into());
let c_map = |c: char| {
let c = c as u32;
let base = 'A' as u32;
if (base..base + 16).contains(&c) {
Some(c - base)
} else {
None
}
};
let mut ret = String::with_capacity(s.len());
let mut it = s.chars();
while let Some(c) = it.next() {
if c == '$' {
let c_hi = it.next().and_then(c_map).ok_or_else(errfn)?;
let c_lo = it.next().and_then(c_map).ok_or_else(errfn)?;
let c_dec = char::try_from(c_hi * 16 + c_lo).map_err(|_| errfn())?;
ret.push(c_dec);
} else {
ret.push(c);
}
}
Ok(ret)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Table names must survive an encode/decode round trip, and the encoded
    /// form must not contain spaces, dots or asterisks.
    #[test]
    fn test_encdec_name() {
        const NAMES: [&str; 5] = [
            "testname",
            "test_name",
            "test name",
            "test$name",
            "test:name@help.me$get/this**right",
        ];

        for &name in NAMES.iter() {
            let encoded = encode_name(name).unwrap();
            assert!(!encoded.contains(' '));
            assert!(!encoded.contains('.'));
            assert!(!encoded.contains('*'));

            let decoded = decode_name(&encoded).unwrap();
            assert_eq!(*name, decoded);
        }
    }
}

View file

@ -1,8 +1,6 @@
#[macro_use] #[macro_use]
extern crate tracing; extern crate tracing;
#[cfg(feature = "fjall")]
pub mod fjall_adapter;
#[cfg(feature = "lmdb")] #[cfg(feature = "lmdb")]
pub mod lmdb_adapter; pub mod lmdb_adapter;
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]
@ -20,7 +18,7 @@ use std::cell::Cell;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use thiserror::Error; use err_derive::Error;
pub use open::*; pub use open::*;
@ -44,7 +42,7 @@ pub type TxValueIter<'a> = Box<dyn std::iter::Iterator<Item = TxOpResult<(Value,
// ---- // ----
#[derive(Debug, Error)] #[derive(Debug, Error)]
#[error("{0}")] #[error(display = "{}", _0)]
pub struct Error(pub Cow<'static, str>); pub struct Error(pub Cow<'static, str>);
impl From<std::io::Error> for Error { impl From<std::io::Error> for Error {
@ -56,7 +54,7 @@ impl From<std::io::Error> for Error {
pub type Result<T> = std::result::Result<T, Error>; pub type Result<T> = std::result::Result<T, Error>;
#[derive(Debug, Error)] #[derive(Debug, Error)]
#[error("{0}")] #[error(display = "{}", _0)]
pub struct TxOpError(pub(crate) Error); pub struct TxOpError(pub(crate) Error);
pub type TxOpResult<T> = std::result::Result<T, TxOpError>; pub type TxOpResult<T> = std::result::Result<T, TxOpError>;
@ -106,44 +104,32 @@ impl Db {
result: Cell::new(None), result: Cell::new(None),
}; };
let tx_res = self.0.transaction(&f); let tx_res = self.0.transaction(&f);
let fn_res = f.result.into_inner(); let ret = f
.result
.into_inner()
.expect("Transaction did not store result");
match (tx_res, fn_res) { match tx_res {
(Ok(on_commit), Some(Ok(value))) => { Ok(on_commit) => match ret {
// Transaction succeeded Ok(value) => {
// TxFn stored the value to return to the user in fn_res on_commit.into_iter().for_each(|f| f());
// tx_res contains the on_commit list of callbacks, run them now Ok(value)
on_commit.into_iter().for_each(|f| f()); }
Ok(value) _ => unreachable!(),
} },
(Err(TxError::Abort(())), Some(Err(TxError::Abort(e)))) => { Err(TxError::Abort(())) => match ret {
// Transaction was aborted by user code Err(TxError::Abort(e)) => Err(TxError::Abort(e)),
// The abort error value is stored in fn_res _ => unreachable!(),
Err(TxError::Abort(e)) },
} Err(TxError::Db(e2)) => match ret {
(Err(TxError::Db(_tx_e)), Some(Err(TxError::Db(fn_e)))) => { // Ok was stored -> the error occurred when finalizing
// Transaction encountered a DB error in user code // transaction
// The error value encountered is the one in fn_res, Ok(_) => Err(TxError::Db(e2)),
// tx_res contains only a dummy error message // An error was already stored: that's the one we want to
Err(TxError::Db(fn_e)) // return
} Err(TxError::Db(e)) => Err(TxError::Db(e)),
(Err(TxError::Db(tx_e)), None) => { _ => unreachable!(),
// Transaction encounterred a DB error when initializing the transaction, },
// before user code was called
Err(TxError::Db(tx_e))
}
(Err(TxError::Db(tx_e)), Some(Ok(_))) => {
// Transaction encounterred a DB error when commiting the transaction,
// after user code was called
Err(TxError::Db(tx_e))
}
(tx_res, fn_res) => {
panic!(
"unexpected error case: tx_res={:?}, fn_res={:?}",
tx_res.map(|_| "..."),
fn_res.map(|x| x.map(|_| "...").map_err(|_| "..."))
);
}
} }
} }
@ -166,7 +152,7 @@ impl Db {
let tree_names = other.list_trees()?; let tree_names = other.list_trees()?;
for name in tree_names { for name in tree_names {
let tree = self.open_tree(&name)?; let tree = self.open_tree(&name)?;
if !tree.is_empty()? { if tree.len()? > 0 {
return Err(Error(format!("tree {} already contains data", name).into())); return Err(Error(format!("tree {} already contains data", name).into()));
} }
@ -208,12 +194,8 @@ impl Tree {
self.0.get(self.1, key.as_ref()) self.0.get(self.1, key.as_ref())
} }
#[inline] #[inline]
pub fn approximate_len(&self) -> Result<usize> { pub fn len(&self) -> Result<usize> {
self.0.approximate_len(self.1) self.0.len(self.1)
}
#[inline]
pub fn is_empty(&self) -> Result<bool> {
self.0.is_empty(self.1)
} }
#[inline] #[inline]
@ -351,8 +333,7 @@ pub(crate) trait IDb: Send + Sync {
fn snapshot(&self, path: &PathBuf) -> Result<()>; fn snapshot(&self, path: &PathBuf) -> Result<()>;
fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>; fn get(&self, tree: usize, key: &[u8]) -> Result<Option<Value>>;
fn approximate_len(&self, tree: usize) -> Result<usize>; fn len(&self, tree: usize) -> Result<usize>;
fn is_empty(&self, tree: usize) -> Result<bool>;
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()>; fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()>;
fn remove(&self, tree: usize, key: &[u8]) -> Result<()>; fn remove(&self, tree: usize, key: &[u8]) -> Result<()>;

View file

@ -1,8 +1,8 @@
use core::ops::Bound; use core::ops::Bound;
use core::ptr::NonNull;
use std::collections::HashMap; use std::collections::HashMap;
use std::convert::TryInto; use std::convert::TryInto;
use std::marker::PhantomPinned;
use std::path::PathBuf; use std::path::PathBuf;
use std::pin::Pin; use std::pin::Pin;
use std::sync::{Arc, RwLock}; use std::sync::{Arc, RwLock};
@ -11,55 +11,12 @@ use heed::types::ByteSlice;
use heed::{BytesDecode, Env, RoTxn, RwTxn, UntypedDatabase as Database}; use heed::{BytesDecode, Env, RoTxn, RwTxn, UntypedDatabase as Database};
use crate::{ use crate::{
open::{Engine, OpenOpt},
Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult, Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult,
TxResult, TxValueIter, Value, ValueIter, TxResult, TxValueIter, Value, ValueIter,
}; };
pub use heed; pub use heed;
// ---- top-level open function
pub(crate) fn open_db(path: &PathBuf, opt: &OpenOpt) -> Result<Db> {
info!("Opening LMDB database at: {}", path.display());
if let Err(e) = std::fs::create_dir_all(&path) {
return Err(Error(
format!("Unable to create LMDB data directory: {}", e).into(),
));
}
let map_size = match opt.lmdb_map_size {
None => recommended_map_size(),
Some(v) => v - (v % 4096),
};
let mut env_builder = heed::EnvOpenOptions::new();
env_builder.max_dbs(100);
env_builder.map_size(map_size);
env_builder.max_readers(2048);
unsafe {
env_builder.flag(heed::flags::Flags::MdbNoRdAhead);
env_builder.flag(heed::flags::Flags::MdbNoMetaSync);
if !opt.fsync {
env_builder.flag(heed::flags::Flags::MdbNoSync);
}
}
match env_builder.open(&path) {
Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
return Err(Error(
"OutOfMemory error while trying to open LMDB database. This can happen \
if your operating system is not allowing you to use sufficient virtual \
memory address space. Please check that no limit is set (ulimit -v). \
You may also try to set a smaller `lmdb_map_size` configuration parameter. \
On 32-bit machines, you should probably switch to another database engine."
.into(),
))
}
Err(e) => Err(Error(format!("Cannot open LMDB database: {}", e).into())),
Ok(db) => Ok(LmdbDb::init(db)),
}
}
// -- err // -- err
impl From<heed::Error> for Error { impl From<heed::Error> for Error {
@ -147,9 +104,10 @@ impl IDb for LmdbDb {
Ok(ret2) Ok(ret2)
} }
fn snapshot(&self, base_path: &PathBuf) -> Result<()> { fn snapshot(&self, to: &PathBuf) -> Result<()> {
std::fs::create_dir_all(base_path)?; std::fs::create_dir_all(to)?;
let path = Engine::Lmdb.db_path(base_path); let mut path = to.clone();
path.push("data.mdb");
self.db self.db
.copy_to_path(path, heed::CompactionOption::Enabled)?; .copy_to_path(path, heed::CompactionOption::Enabled)?;
Ok(()) Ok(())
@ -168,16 +126,11 @@ impl IDb for LmdbDb {
} }
} }
fn approximate_len(&self, tree: usize) -> Result<usize> { fn len(&self, tree: usize) -> Result<usize> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?; let tx = self.db.read_txn()?;
Ok(tree.len(&tx)?.try_into().unwrap()) Ok(tree.len(&tx)?.try_into().unwrap())
} }
fn is_empty(&self, tree: usize) -> Result<bool> {
let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?;
Ok(tree.is_empty(&tx)?)
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()> { fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
@ -206,15 +159,13 @@ impl IDb for LmdbDb {
fn iter(&self, tree: usize) -> Result<ValueIter<'_>> { fn iter(&self, tree: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?; let tx = self.db.read_txn()?;
// Safety: the cloture does not store its argument anywhere, TxAndIterator::make(tx, |tx| Ok(tree.iter(tx)?))
unsafe { TxAndIterator::make(tx, |tx| Ok(tree.iter(tx)?)) }
} }
fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> { fn iter_rev(&self, tree: usize) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?; let tx = self.db.read_txn()?;
// Safety: the cloture does not store its argument anywhere, TxAndIterator::make(tx, |tx| Ok(tree.rev_iter(tx)?))
unsafe { TxAndIterator::make(tx, |tx| Ok(tree.rev_iter(tx)?)) }
} }
fn range<'r>( fn range<'r>(
@ -225,8 +176,7 @@ impl IDb for LmdbDb {
) -> Result<ValueIter<'_>> { ) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?; let tx = self.db.read_txn()?;
// Safety: the cloture does not store its argument anywhere, TxAndIterator::make(tx, |tx| Ok(tree.range(tx, &(low, high))?))
unsafe { TxAndIterator::make(tx, |tx| Ok(tree.range(tx, &(low, high))?)) }
} }
fn range_rev<'r>( fn range_rev<'r>(
&self, &self,
@ -236,8 +186,7 @@ impl IDb for LmdbDb {
) -> Result<ValueIter<'_>> { ) -> Result<ValueIter<'_>> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let tx = self.db.read_txn()?; let tx = self.db.read_txn()?;
// Safety: the cloture does not store its argument anywhere, TxAndIterator::make(tx, |tx| Ok(tree.rev_range(tx, &(low, high))?))
unsafe { TxAndIterator::make(tx, |tx| Ok(tree.rev_range(tx, &(low, high))?)) }
} }
// ---- // ----
@ -367,41 +316,28 @@ where
{ {
tx: RoTxn<'a>, tx: RoTxn<'a>,
iter: Option<I>, iter: Option<I>,
_pin: PhantomPinned,
} }
impl<'a, I> TxAndIterator<'a, I> impl<'a, I> TxAndIterator<'a, I>
where where
I: Iterator<Item = IteratorItem<'a>> + 'a, I: Iterator<Item = IteratorItem<'a>> + 'a,
{ {
fn iter(self: Pin<&mut Self>) -> &mut Option<I> { fn make<F>(tx: RoTxn<'a>, iterfun: F) -> Result<ValueIter<'a>>
// Safety: iter is not structural
unsafe { &mut self.get_unchecked_mut().iter }
}
/// Safety: iterfun must not store its argument anywhere but in its result.
unsafe fn make<F>(tx: RoTxn<'a>, iterfun: F) -> Result<ValueIter<'a>>
where where
F: FnOnce(&'a RoTxn<'a>) -> Result<I>, F: FnOnce(&'a RoTxn<'a>) -> Result<I>,
{ {
let res = TxAndIterator { let res = TxAndIterator { tx, iter: None };
tx,
iter: None,
_pin: PhantomPinned,
};
let mut boxed = Box::pin(res); let mut boxed = Box::pin(res);
let tx_lifetime_overextended: &'a RoTxn<'a> = { // This unsafe allows us to bypass lifetime checks
let tx = &boxed.tx; let tx = unsafe { NonNull::from(&boxed.tx).as_ref() };
// Safety: Artificially extending the lifetime because let iter = iterfun(tx)?;
// this reference will only be stored and accessed from the
// returned ValueIter which guarantees that it is destroyed
// before the tx it is pointing to.
unsafe { &*&raw const *tx }
};
let iter = iterfun(&tx_lifetime_overextended)?;
*boxed.as_mut().iter() = Some(iter); let mut_ref = Pin::as_mut(&mut boxed);
// This unsafe allows us to write in a field of the pinned struct
unsafe {
Pin::get_unchecked_mut(mut_ref).iter = Some(iter);
}
Ok(Box::new(TxAndIteratorPin(boxed))) Ok(Box::new(TxAndIteratorPin(boxed)))
} }
@ -412,10 +348,8 @@ where
I: Iterator<Item = IteratorItem<'a>> + 'a, I: Iterator<Item = IteratorItem<'a>> + 'a,
{ {
fn drop(&mut self) { fn drop(&mut self) {
// Safety: `new_unchecked` is okay because we know this value is never // ensure the iterator is dropped before the RoTxn it references
// used again after being dropped. drop(self.iter.take());
let this = unsafe { Pin::new_unchecked(self) };
drop(this.iter().take());
} }
} }
@ -431,12 +365,13 @@ where
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
let mut_ref = Pin::as_mut(&mut self.0); let mut_ref = Pin::as_mut(&mut self.0);
let next = mut_ref.iter().as_mut()?.next()?; // This unsafe allows us to mutably access the iterator field
let res = match next { let next = unsafe { Pin::get_unchecked_mut(mut_ref).iter.as_mut()?.next() };
Err(e) => Err(e.into()), match next {
Ok((k, v)) => Ok((k.to_vec(), v.to_vec())), None => None,
}; Some(Err(e)) => Some(Err(e.into())),
Some(res) Some(Ok((k, v))) => Some(Ok((k.to_vec(), v.to_vec()))),
}
} }
} }

View file

@ -11,7 +11,6 @@ use crate::{Db, Error, Result};
pub enum Engine { pub enum Engine {
Lmdb, Lmdb,
Sqlite, Sqlite,
Fjall,
} }
impl Engine { impl Engine {
@ -20,26 +19,8 @@ impl Engine {
match self { match self {
Self::Lmdb => "lmdb", Self::Lmdb => "lmdb",
Self::Sqlite => "sqlite", Self::Sqlite => "sqlite",
Self::Fjall => "fjall",
} }
} }
/// Return engine-specific DB path from base path
pub fn db_path(&self, base_path: &PathBuf) -> PathBuf {
let mut ret = base_path.clone();
match self {
Self::Lmdb => {
ret.push("db.lmdb");
}
Self::Sqlite => {
ret.push("db.sqlite");
}
Self::Fjall => {
ret.push("db.fjall");
}
}
ret
}
} }
impl std::fmt::Display for Engine { impl std::fmt::Display for Engine {
@ -55,11 +36,10 @@ impl std::str::FromStr for Engine {
match text { match text {
"lmdb" | "heed" => Ok(Self::Lmdb), "lmdb" | "heed" => Ok(Self::Lmdb),
"sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite), "sqlite" | "sqlite3" | "rusqlite" => Ok(Self::Sqlite),
"fjall" => Ok(Self::Fjall),
"sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.4).".into())), "sled" => Err(Error("Sled is no longer supported as a database engine. Converting your old metadata db can be done using an older Garage binary (e.g. v0.9.4).".into())),
kind => Err(Error( kind => Err(Error(
format!( format!(
"Invalid DB engine: {} (options are: lmdb, sqlite, fjall)", "Invalid DB engine: {} (options are: lmdb, sqlite)",
kind kind
) )
.into(), .into(),
@ -71,7 +51,6 @@ impl std::str::FromStr for Engine {
pub struct OpenOpt { pub struct OpenOpt {
pub fsync: bool, pub fsync: bool,
pub lmdb_map_size: Option<usize>, pub lmdb_map_size: Option<usize>,
pub fjall_block_cache_size: Option<usize>,
} }
impl Default for OpenOpt { impl Default for OpenOpt {
@ -79,7 +58,6 @@ impl Default for OpenOpt {
Self { Self {
fsync: false, fsync: false,
lmdb_map_size: None, lmdb_map_size: None,
fjall_block_cache_size: None,
} }
} }
} }
@ -88,15 +66,53 @@ pub fn open_db(path: &PathBuf, engine: Engine, opt: &OpenOpt) -> Result<Db> {
match engine { match engine {
// ---- Sqlite DB ---- // ---- Sqlite DB ----
#[cfg(feature = "sqlite")] #[cfg(feature = "sqlite")]
Engine::Sqlite => crate::sqlite_adapter::open_db(path, opt), Engine::Sqlite => {
info!("Opening Sqlite database at: {}", path.display());
let manager = r2d2_sqlite::SqliteConnectionManager::file(path);
Ok(crate::sqlite_adapter::SqliteDb::new(manager, opt.fsync)?)
}
// ---- LMDB DB ---- // ---- LMDB DB ----
#[cfg(feature = "lmdb")] #[cfg(feature = "lmdb")]
Engine::Lmdb => crate::lmdb_adapter::open_db(path, opt), Engine::Lmdb => {
info!("Opening LMDB database at: {}", path.display());
if let Err(e) = std::fs::create_dir_all(&path) {
return Err(Error(
format!("Unable to create LMDB data directory: {}", e).into(),
));
}
// ---- Fjall DB ---- let map_size = match opt.lmdb_map_size {
#[cfg(feature = "fjall")] None => crate::lmdb_adapter::recommended_map_size(),
Engine::Fjall => crate::fjall_adapter::open_db(path, opt), Some(v) => v - (v % 4096),
};
let mut env_builder = heed::EnvOpenOptions::new();
env_builder.max_dbs(100);
env_builder.map_size(map_size);
env_builder.max_readers(2048);
unsafe {
env_builder.flag(crate::lmdb_adapter::heed::flags::Flags::MdbNoRdAhead);
env_builder.flag(crate::lmdb_adapter::heed::flags::Flags::MdbNoMetaSync);
if !opt.fsync {
env_builder.flag(heed::flags::Flags::MdbNoSync);
}
}
match env_builder.open(&path) {
Err(heed::Error::Io(e)) if e.kind() == std::io::ErrorKind::OutOfMemory => {
return Err(Error(
"OutOfMemory error while trying to open LMDB database. This can happen \
if your operating system is not allowing you to use sufficient virtual \
memory address space. Please check that no limit is set (ulimit -v). \
You may also try to set a smaller `lmdb_map_size` configuration parameter. \
On 32-bit machines, you should probably switch to another database engine."
.into(),
))
}
Err(e) => Err(Error(format!("Cannot open LMDB database: {}", e).into())),
Ok(db) => Ok(crate::lmdb_adapter::LmdbDb::init(db)),
}
}
// Pattern is unreachable when all supported DB engines are compiled into binary. The allow // Pattern is unreachable when all supported DB engines are compiled into binary. The allow
// attribute is added so that we won't have to change this match in case we stop building // attribute is added so that we won't have to change this match in case we stop building

View file

@ -11,23 +11,12 @@ use r2d2_sqlite::SqliteConnectionManager;
use rusqlite::{params, Rows, Statement, Transaction}; use rusqlite::{params, Rows, Statement, Transaction};
use crate::{ use crate::{
open::{Engine, OpenOpt},
Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult, Db, Error, IDb, ITx, ITxFn, OnCommit, Result, TxError, TxFnResult, TxOpError, TxOpResult,
TxResult, TxValueIter, Value, ValueIter, TxResult, TxValueIter, Value, ValueIter,
}; };
pub use rusqlite; pub use rusqlite;
// ---- top-level open function
pub(crate) fn open_db(path: &PathBuf, opt: &OpenOpt) -> Result<Db> {
info!("Opening Sqlite database at: {}", path.display());
let manager = r2d2_sqlite::SqliteConnectionManager::file(path);
Ok(SqliteDb::new(manager, opt.fsync)?)
}
// ----
type Connection = r2d2::PooledConnection<SqliteConnectionManager>; type Connection = r2d2::PooledConnection<SqliteConnectionManager>;
// --- err // --- err
@ -150,18 +139,17 @@ impl IDb for SqliteDb {
Ok(trees) Ok(trees)
} }
fn snapshot(&self, base_path: &PathBuf) -> Result<()> { fn snapshot(&self, to: &PathBuf) -> Result<()> {
std::fs::create_dir_all(base_path)?; fn progress(p: rusqlite::backup::Progress) {
let path = Engine::Sqlite let percent = (p.pagecount - p.remaining) * 100 / p.pagecount;
.db_path(&base_path) info!("Sqlite snapshot progress: {}%", percent);
.into_os_string() }
.into_string() std::fs::create_dir_all(to)?;
.map_err(|_| Error("invalid sqlite path string".into()))?; let mut path = to.clone();
path.push("db.sqlite");
info!("Start sqlite VACUUM INTO `{}`", path); self.db
self.db.get()?.execute("VACUUM INTO ?1", params![path])?; .get()?
info!("Finished sqlite VACUUM INTO `{}`", path); .backup(rusqlite::DatabaseName::Main, path, Some(progress))?;
Ok(()) Ok(())
} }
@ -172,7 +160,7 @@ impl IDb for SqliteDb {
self.internal_get(&self.db.get()?, &tree, key) self.internal_get(&self.db.get()?, &tree, key)
} }
fn approximate_len(&self, tree: usize) -> Result<usize> { fn len(&self, tree: usize) -> Result<usize> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let db = self.db.get()?; let db = self.db.get()?;
@ -184,10 +172,6 @@ impl IDb for SqliteDb {
} }
} }
fn is_empty(&self, tree: usize) -> Result<bool> {
Ok(self.approximate_len(tree)? == 0)
}
fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()> { fn insert(&self, tree: usize, key: &[u8], value: &[u8]) -> Result<()> {
let tree = self.get_tree(tree)?; let tree = self.get_tree(tree)?;
let db = self.db.get()?; let db = self.db.get()?;

View file

@ -1,7 +1,7 @@
use crate::*; use crate::*;
fn test_suite(db: Db) { fn test_suite(db: Db) {
let tree = db.open_tree("tree:this_is_a_tree").unwrap(); let tree = db.open_tree("tree").unwrap();
let ka: &[u8] = &b"test"[..]; let ka: &[u8] = &b"test"[..];
let kb: &[u8] = &b"zwello"[..]; let kb: &[u8] = &b"zwello"[..];
@ -14,7 +14,7 @@ fn test_suite(db: Db) {
assert!(tree.insert(ka, va).is_ok()); assert!(tree.insert(ka, va).is_ok());
assert_eq!(tree.get(ka).unwrap().unwrap(), va); assert_eq!(tree.get(ka).unwrap().unwrap(), va);
assert_eq!(tree.iter().unwrap().count(), 1); assert_eq!(tree.len().unwrap(), 1);
// ---- test transaction logic ---- // ---- test transaction logic ----
@ -148,15 +148,3 @@ fn test_sqlite_db() {
let db = SqliteDb::new(manager, false).unwrap(); let db = SqliteDb::new(manager, false).unwrap();
test_suite(db); test_suite(db);
} }
#[test]
#[cfg(feature = "fjall")]
fn test_fjall_db() {
use crate::fjall_adapter::{fjall, FjallDb};
let path = mktemp::Temp::new_dir().unwrap();
let config = fjall::Config::new(path).temporary(true);
let keyspace = config.open_transactional().unwrap();
let db = FjallDb::init(keyspace);
test_suite(db);
}

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage" name = "garage"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -57,7 +57,6 @@ opentelemetry.workspace = true
opentelemetry-prometheus = { workspace = true, optional = true } opentelemetry-prometheus = { workspace = true, optional = true }
opentelemetry-otlp = { workspace = true, optional = true } opentelemetry-otlp = { workspace = true, optional = true }
syslog-tracing = { workspace = true, optional = true } syslog-tracing = { workspace = true, optional = true }
tracing-journald = { workspace = true, optional = true }
[dev-dependencies] [dev-dependencies]
garage_api_common.workspace = true garage_api_common.workspace = true
@ -91,7 +90,6 @@ k2v = [ "garage_util/k2v", "garage_api_k2v" ]
# Database engines # Database engines
lmdb = [ "garage_model/lmdb" ] lmdb = [ "garage_model/lmdb" ]
sqlite = [ "garage_model/sqlite" ] sqlite = [ "garage_model/sqlite" ]
fjall = [ "garage_model/fjall" ]
# Automatic registration and discovery via Consul API # Automatic registration and discovery via Consul API
consul-discovery = [ "garage_rpc/consul-discovery" ] consul-discovery = [ "garage_rpc/consul-discovery" ]
@ -103,8 +101,6 @@ metrics = [ "garage_api_admin/metrics", "opentelemetry-prometheus" ]
telemetry-otlp = [ "opentelemetry-otlp" ] telemetry-otlp = [ "opentelemetry-otlp" ]
# Logging to syslog # Logging to syslog
syslog = [ "syslog-tracing" ] syslog = [ "syslog-tracing" ]
# Logging to journald
journald = [ "tracing-journald" ]
# NOTE: bundled-libs and system-libs should be treated as mutually exclusive; # NOTE: bundled-libs and system-libs should be treated as mutually exclusive;
# exactly one of them should be enabled. # exactly one of them should be enabled.

View file

@ -101,7 +101,6 @@ impl AdminRpcHandler {
let mut obj_dels = 0; let mut obj_dels = 0;
let mut mpu_dels = 0; let mut mpu_dels = 0;
let mut ver_dels = 0; let mut ver_dels = 0;
let mut br_dels = 0;
for hash in blocks { for hash in blocks {
let hash = hex::decode(hash).ok_or_bad_request("invalid hash")?; let hash = hex::decode(hash).ok_or_bad_request("invalid hash")?;
@ -132,19 +131,12 @@ impl AdminRpcHandler {
ver_dels += 1; ver_dels += 1;
} }
} }
if !br.deleted.get() {
let mut br = br;
br.deleted.set();
self.garage.block_ref_table.insert(&br).await?;
br_dels += 1;
}
} }
} }
Ok(AdminRpc::Ok(format!( Ok(AdminRpc::Ok(format!(
"Purged {} blocks: marked {} block refs, {} versions, {} objects and {} multipart uploads as deleted", "Purged {} blocks, {} versions, {} objects, {} multipart uploads",
blocks.len(), blocks.len(),
br_dels,
ver_dels, ver_dels,
obj_dels, obj_dels,
mpu_dels, mpu_dels,

View file

@ -126,7 +126,7 @@ impl AdminRpcHandler {
#[allow(clippy::ptr_arg)] #[allow(clippy::ptr_arg)]
async fn handle_create_bucket(&self, name: &String) -> Result<AdminRpc, Error> { async fn handle_create_bucket(&self, name: &String) -> Result<AdminRpc, Error> {
if !is_valid_bucket_name(name, self.garage.config.allow_punycode) { if !is_valid_bucket_name(name) {
return Err(Error::BadRequest(format!( return Err(Error::BadRequest(format!(
"{}: {}", "{}: {}",
name, INVALID_BUCKET_NAME_MESSAGE name, INVALID_BUCKET_NAME_MESSAGE

View file

@ -13,6 +13,8 @@ use serde::{Deserialize, Serialize};
use format_table::format_table_to_string; use format_table::format_table_to_string;
use garage_net::endpoint::RpcInFlightLimiter;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
@ -118,6 +120,7 @@ impl AdminRpcHandler {
&node, &node,
AdminRpc::LaunchRepair(opt_to_send.clone()), AdminRpc::LaunchRepair(opt_to_send.clone()),
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await; .await;
if !matches!(resp, Ok(Ok(_))) { if !matches!(resp, Ok(Ok(_))) {
@ -164,7 +167,12 @@ impl AdminRpcHandler {
let node_id = (*node).into(); let node_id = (*node).into();
match self match self
.endpoint .endpoint
.call(&node_id, AdminRpc::Stats(opt), PRIO_NORMAL) .call(
&node_id,
AdminRpc::Stats(opt),
PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
)
.await .await
{ {
Ok(Ok(AdminRpc::Ok(s))) => writeln!(&mut ret, "{}", s).unwrap(), Ok(Ok(AdminRpc::Ok(s))) => writeln!(&mut ret, "{}", s).unwrap(),
@ -219,7 +227,7 @@ impl AdminRpcHandler {
// Gather block manager statistics // Gather block manager statistics
writeln!(&mut ret, "\nBlock manager stats:").unwrap(); writeln!(&mut ret, "\nBlock manager stats:").unwrap();
let rc_len = self.garage.block_manager.rc_approximate_len()?.to_string(); let rc_len = self.garage.block_manager.rc_len()?.to_string();
writeln!( writeln!(
&mut ret, &mut ret,
@ -230,13 +238,13 @@ impl AdminRpcHandler {
writeln!( writeln!(
&mut ret, &mut ret,
" resync queue length: {}", " resync queue length: {}",
self.garage.block_manager.resync.queue_approximate_len()? self.garage.block_manager.resync.queue_len()?
) )
.unwrap(); .unwrap();
writeln!( writeln!(
&mut ret, &mut ret,
" blocks with resync errors: {}", " blocks with resync errors: {}",
self.garage.block_manager.resync.errors_approximate_len()? self.garage.block_manager.resync.errors_len()?
) )
.unwrap(); .unwrap();
@ -346,21 +354,16 @@ impl AdminRpcHandler {
F: TableSchema + 'static, F: TableSchema + 'static,
R: TableReplication + 'static, R: TableReplication + 'static,
{ {
let data_len = t let data_len = t.data.store.len().map_err(GarageError::from)?.to_string();
.data let mkl_len = t.merkle_updater.merkle_tree_len()?.to_string();
.store
.approximate_len()
.map_err(GarageError::from)?
.to_string();
let mkl_len = t.merkle_updater.merkle_tree_approximate_len()?.to_string();
Ok(format!( Ok(format!(
" {}\t{}\t{}\t{}\t{}", " {}\t{}\t{}\t{}\t{}",
F::TABLE_NAME, F::TABLE_NAME,
data_len, data_len,
mkl_len, mkl_len,
t.merkle_updater.todo_approximate_len()?, t.merkle_updater.todo_len()?,
t.data.gc_todo_approximate_len()? t.data.gc_todo_len()?
)) ))
} }
@ -412,6 +415,7 @@ impl AdminRpcHandler {
variable: variable.clone(), variable: variable.clone(),
}), }),
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await?? .await??
{ {
@ -461,6 +465,7 @@ impl AdminRpcHandler {
value: value.to_string(), value: value.to_string(),
}), }),
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await?? .await??
{ {
@ -493,6 +498,7 @@ impl AdminRpcHandler {
&to, &to,
AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false }), AdminRpc::MetaOperation(MetaOperation::Snapshot { all: false }),
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await? .await?
})) }))

View file

@ -2,6 +2,7 @@ use std::collections::{HashMap, HashSet};
use std::time::Duration; use std::time::Duration;
use format_table::format_table; use format_table::format_table;
use garage_net::endpoint::RpcInFlightLimiter;
use garage_util::error::*; use garage_util::error::*;
use garage_rpc::layout::*; use garage_rpc::layout::*;
@ -200,7 +201,12 @@ pub async fn cmd_connect(
args: ConnectNodeOpt, args: ConnectNodeOpt,
) -> Result<(), Error> { ) -> Result<(), Error> {
match rpc_cli match rpc_cli
.call(&rpc_host, SystemRpc::Connect(args.node), PRIO_NORMAL) .call(
&rpc_host,
SystemRpc::Connect(args.node),
PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
)
.await?? .await??
{ {
SystemRpc::Ok => { SystemRpc::Ok => {
@ -216,7 +222,10 @@ pub async fn cmd_admin(
rpc_host: NodeID, rpc_host: NodeID,
args: AdminRpc, args: AdminRpc,
) -> Result<(), HelperError> { ) -> Result<(), HelperError> {
match rpc_cli.call(&rpc_host, args, PRIO_NORMAL).await?? { match rpc_cli
.call(&rpc_host, args, PRIO_NORMAL, RpcInFlightLimiter::NoLimit)
.await??
{
AdminRpc::Ok(msg) => { AdminRpc::Ok(msg) => {
println!("{}", msg); println!("{}", msg);
} }
@ -271,7 +280,12 @@ pub async fn fetch_status(
rpc_host: NodeID, rpc_host: NodeID,
) -> Result<Vec<KnownNodeInfo>, Error> { ) -> Result<Vec<KnownNodeInfo>, Error> {
match rpc_cli match rpc_cli
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) .call(
&rpc_host,
SystemRpc::GetKnownNodes,
PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
)
.await?? .await??
{ {
SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes), SystemRpc::ReturnKnownNodes(nodes) => Ok(nodes),

View file

@ -1,6 +1,7 @@
use bytesize::ByteSize; use bytesize::ByteSize;
use format_table::format_table; use format_table::format_table;
use garage_net::endpoint::RpcInFlightLimiter;
use garage_util::crdt::Crdt; use garage_util::crdt::Crdt;
use garage_util::error::*; use garage_util::error::*;
@ -45,7 +46,12 @@ pub async fn cmd_assign_role(
args: AssignRoleOpt, args: AssignRoleOpt,
) -> Result<(), Error> { ) -> Result<(), Error> {
let status = match rpc_cli let status = match rpc_cli
.call(&rpc_host, SystemRpc::GetKnownNodes, PRIO_NORMAL) .call(
&rpc_host,
SystemRpc::GetKnownNodes,
PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
)
.await?? .await??
{ {
SystemRpc::ReturnKnownNodes(nodes) => nodes, SystemRpc::ReturnKnownNodes(nodes) => nodes,
@ -475,7 +481,12 @@ pub async fn fetch_layout(
rpc_host: NodeID, rpc_host: NodeID,
) -> Result<LayoutHistory, Error> { ) -> Result<LayoutHistory, Error> {
match rpc_cli match rpc_cli
.call(&rpc_host, SystemRpc::PullClusterLayout, PRIO_NORMAL) .call(
&rpc_host,
SystemRpc::PullClusterLayout,
PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
)
.await?? .await??
{ {
SystemRpc::AdvertiseClusterLayout(t) => Ok(t), SystemRpc::AdvertiseClusterLayout(t) => Ok(t),
@ -493,6 +504,7 @@ pub async fn send_layout(
&rpc_host, &rpc_host,
SystemRpc::AdvertiseClusterLayout(layout), SystemRpc::AdvertiseClusterLayout(layout),
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await??; .await??;
Ok(()) Ok(())

View file

@ -466,10 +466,6 @@ pub enum RepairWhat {
/// Repair (resync/rebalance) the set of stored blocks in the cluster /// Repair (resync/rebalance) the set of stored blocks in the cluster
#[structopt(name = "blocks", version = garage_version())] #[structopt(name = "blocks", version = garage_version())]
Blocks, Blocks,
/// Clear the block resync queue. The list of blocks in errored state
/// is cleared as well. You MUST run `garage repair blocks` after invoking this.
#[structopt(name = "clear-resync-queue", version = garage_version())]
ClearResyncQueue,
/// Repropagate object deletions to the version table /// Repropagate object deletions to the version table
#[structopt(name = "versions", version = garage_version())] #[structopt(name = "versions", version = garage_version())]
Versions, Versions,
@ -482,9 +478,6 @@ pub enum RepairWhat {
/// Recalculate block reference counters /// Recalculate block reference counters
#[structopt(name = "block-rc", version = garage_version())] #[structopt(name = "block-rc", version = garage_version())]
BlockRc, BlockRc,
/// Fix inconsistency in bucket aliases (WARNING: EXPERIMENTAL)
#[structopt(name = "aliases", version = garage_version())]
Aliases,
/// Verify integrity of all blocks on disc /// Verify integrity of all blocks on disc
#[structopt(name = "scrub", version = garage_version())] #[structopt(name = "scrub", version = garage_version())]
Scrub { Scrub {

View file

@ -208,43 +208,6 @@ fn init_logging(opt: &Opt) {
} }
} }
if std::env::var("GARAGE_LOG_TO_JOURNALD")
.map(|x| x == "1" || x == "true")
.unwrap_or(false)
{
#[cfg(feature = "journald")]
{
use tracing_journald::{Priority, PriorityMappings};
use tracing_subscriber::layer::SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;
let registry = tracing_subscriber::registry()
.with(tracing_subscriber::fmt::layer().with_writer(std::io::sink))
.with(env_filter);
match tracing_journald::layer() {
Ok(layer) => {
registry
.with(layer.with_priority_mappings(PriorityMappings {
info: Priority::Informational,
debug: Priority::Debug,
..PriorityMappings::new()
}))
.init();
}
Err(e) => {
eprintln!("Couldn't connect to journald: {}.", e);
std::process::exit(1);
}
}
return;
}
#[cfg(not(feature = "journald"))]
{
eprintln!("Journald support is not enabled in this build.");
std::process::exit(1);
}
}
tracing_subscriber::fmt() tracing_subscriber::fmt()
.with_writer(std::io::stderr) .with_writer(std::io::stderr)
.with_env_filter(env_filter) .with_env_filter(env_filter)
@ -281,7 +244,7 @@ async fn cli_command(opt: Opt) -> Result<(), Error> {
// Generate a temporary keypair for our RPC client // Generate a temporary keypair for our RPC client
let (_pk, sk) = sodiumoxide::crypto::sign::ed25519::gen_keypair(); let (_pk, sk) = sodiumoxide::crypto::sign::ed25519::gen_keypair();
let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, sk, None); let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, sk, None, None);
// Find and parse the address of the target host // Find and parse the address of the target host
let (id, addr, is_default_addr) = if let Some(h) = opt.rpc_host { let (id, addr, is_default_addr) = if let Some(h) = opt.rpc_host {

View file

@ -88,15 +88,6 @@ pub async fn launch_online_repair(
garage.block_manager.clone(), garage.block_manager.clone(),
)); ));
} }
RepairWhat::Aliases => {
info!("Repairing bucket aliases (foreground)");
garage.locked_helper().await.repair_aliases().await?;
}
RepairWhat::ClearResyncQueue => {
let garage = garage.clone();
tokio::task::spawn_blocking(move || garage.block_manager.resync.clear_resync_queue())
.await??
}
} }
Ok(()) Ok(())
} }

View file

@ -183,21 +183,10 @@ fn watch_shutdown_signal() -> watch::Receiver<bool> {
let mut sigterm = let mut sigterm =
signal(SignalKind::terminate()).expect("Failed to install SIGTERM handler"); signal(SignalKind::terminate()).expect("Failed to install SIGTERM handler");
let mut sighup = signal(SignalKind::hangup()).expect("Failed to install SIGHUP handler"); let mut sighup = signal(SignalKind::hangup()).expect("Failed to install SIGHUP handler");
loop { tokio::select! {
tokio::select! { _ = sigint.recv() => info!("Received SIGINT, shutting down."),
_ = sigint.recv() => { _ = sigterm.recv() => info!("Received SIGTERM, shutting down."),
info!("Received SIGINT, shutting down."); _ = sighup.recv() => info!("Received SIGHUP, shutting down."),
break
}
_ = sigterm.recv() => {
info!("Received SIGTERM, shutting down.");
break
}
_ = sighup.recv() => {
info!("Received SIGHUP, reload not supported.");
continue
}
}
} }
send_cancel.send(true).unwrap(); send_cancel.send(true).unwrap();
}); });

View file

@ -63,8 +63,6 @@ rpc_bind_addr = "127.0.0.1:{rpc_port}"
rpc_public_addr = "127.0.0.1:{rpc_port}" rpc_public_addr = "127.0.0.1:{rpc_port}"
rpc_secret = "{secret}" rpc_secret = "{secret}"
allow_punycode = true
[s3_api] [s3_api]
s3_region = "{region}" s3_region = "{region}"
api_bind_addr = "127.0.0.1:{s3_port}" api_bind_addr = "127.0.0.1:{s3_port}"

View file

@ -198,7 +198,6 @@ async fn test_precondition() {
); );
} }
let older_date = DateTime::from_secs_f64(last_modified.as_secs_f64() - 10.0); let older_date = DateTime::from_secs_f64(last_modified.as_secs_f64() - 10.0);
let same_date = DateTime::from_secs_f64(last_modified.as_secs_f64());
let newer_date = DateTime::from_secs_f64(last_modified.as_secs_f64() + 10.0); let newer_date = DateTime::from_secs_f64(last_modified.as_secs_f64() + 10.0);
{ {
let err = ctx let err = ctx
@ -213,18 +212,6 @@ async fn test_precondition() {
matches!(err, Err(SdkError::ServiceError(se)) if se.raw().status().as_u16() == 304) matches!(err, Err(SdkError::ServiceError(se)) if se.raw().status().as_u16() == 304)
); );
let err = ctx
.client
.get_object()
.bucket(&bucket)
.key(STD_KEY)
.if_modified_since(same_date)
.send()
.await;
assert!(
matches!(err, Err(SdkError::ServiceError(se)) if se.raw().status().as_u16() == 304)
);
let o = ctx let o = ctx
.client .client
.get_object() .get_object()
@ -249,17 +236,6 @@ async fn test_precondition() {
matches!(err, Err(SdkError::ServiceError(se)) if se.raw().status().as_u16() == 412) matches!(err, Err(SdkError::ServiceError(se)) if se.raw().status().as_u16() == 412)
); );
let o = ctx
.client
.get_object()
.bucket(&bucket)
.key(STD_KEY)
.if_unmodified_since(same_date)
.send()
.await
.unwrap();
assert_eq!(o.e_tag.as_ref().unwrap().as_str(), etag);
let o = ctx let o = ctx
.client .client
.get_object() .get_object()

View file

@ -533,118 +533,3 @@ async fn test_website_check_domain() {
}) })
); );
} }
#[tokio::test]
async fn test_website_puny() {
const BCKT_NAME: &str = "xn--pda.eu";
let ctx = common::context();
let bucket = ctx.create_bucket(BCKT_NAME);
let data = ByteStream::from_static(BODY);
ctx.client
.put_object()
.bucket(&bucket)
.key("index.html")
.body(data)
.send()
.await
.unwrap();
let client = Client::builder(TokioExecutor::new()).build_http();
let req = |suffix| {
Request::builder()
.method("GET")
.uri(format!("http://127.0.0.1:{}/", ctx.garage.web_port))
.header("Host", format!("{}{}", BCKT_NAME, suffix))
.body(Body::new(Bytes::new()))
.unwrap()
};
ctx.garage
.command()
.args(["bucket", "website", "--allow", BCKT_NAME])
.quiet()
.expect_success_status("Could not allow website on bucket");
let mut resp = client.request(req("")).await.unwrap();
assert_eq!(resp.status(), StatusCode::OK);
assert_eq!(
resp.into_body().collect().await.unwrap().to_bytes(),
BODY.as_ref()
);
resp = client.request(req(".web.garage")).await.unwrap();
assert_eq!(resp.status(), StatusCode::OK);
assert_eq!(
resp.into_body().collect().await.unwrap().to_bytes(),
BODY.as_ref()
);
for bname in [
BCKT_NAME.to_string(),
format!("{BCKT_NAME}.web.garage"),
format!("{BCKT_NAME}.s3.garage"),
] {
let admin_req = || {
Request::builder()
.method("GET")
.uri(format!(
"http://127.0.0.1:{0}/check?domain={1}",
ctx.garage.admin_port, bname
))
.body(Body::new(Bytes::new()))
.unwrap()
};
let admin_resp = client.request(admin_req()).await.unwrap();
assert_eq!(admin_resp.status(), StatusCode::OK);
assert_eq!(
admin_resp.into_body().collect().await.unwrap().to_bytes(),
format!("Domain '{bname}' is managed by Garage").as_bytes()
);
}
}
#[tokio::test]
async fn test_website_object_not_found() {
const BCKT_NAME: &str = "not-found";
let ctx = common::context();
let _bucket = ctx.create_bucket(BCKT_NAME);
let client = Client::builder(TokioExecutor::new()).build_http();
let req = |suffix| {
Request::builder()
.method("GET")
.uri(format!("http://127.0.0.1:{}/", ctx.garage.web_port))
.header("Host", format!("{}{}", BCKT_NAME, suffix))
.body(Body::new(Bytes::new()))
.unwrap()
};
ctx.garage
.command()
.args(["bucket", "website", "--allow", BCKT_NAME])
.quiet()
.expect_success_status("Could not allow website on bucket");
let resp = client.request(req("")).await.unwrap();
assert_eq!(resp.status(), StatusCode::NOT_FOUND);
// the errors we return by default are *not* XML
assert_eq!(
resp.headers().get(http::header::CONTENT_TYPE).unwrap(),
"text/html; charset=utf-8"
);
let result = String::from_utf8(
resp.into_body()
.collect()
.await
.unwrap()
.to_bytes()
.to_vec(),
)
.unwrap();
assert!(result.contains("not found"));
}

View file

@ -72,16 +72,6 @@ impl K2vClient {
.enable_http2() .enable_http2()
.build(); .build();
let client = HttpClient::builder(TokioExecutor::new()).build(connector); let client = HttpClient::builder(TokioExecutor::new()).build(connector);
Self::new_with_client(config, client)
}
/// Create a new K2V client with an external client.
/// Useful for example if you plan on creating many clients but you want to share the
/// underlying thread pools & co.
pub fn new_with_client(
config: K2vClientConfig,
client: HttpClient<HttpsConnector<HttpConnector>, Body>,
) -> Result<Self, Error> {
let user_agent: std::borrow::Cow<str> = match &config.user_agent { let user_agent: std::borrow::Cow<str> = match &config.user_agent {
Some(ua) => ua.into(), Some(ua) => ua.into(),
None => format!("k2v/{}", env!("CARGO_PKG_VERSION")).into(), None => format!("k2v/{}", env!("CARGO_PKG_VERSION")).into(),

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_model" name = "garage_model"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -24,7 +24,7 @@ garage_net.workspace = true
async-trait.workspace = true async-trait.workspace = true
blake2.workspace = true blake2.workspace = true
chrono.workspace = true chrono.workspace = true
thiserror.workspace = true err-derive.workspace = true
hex.workspace = true hex.workspace = true
http.workspace = true http.workspace = true
base64.workspace = true base64.workspace = true
@ -44,4 +44,3 @@ default = [ "lmdb", "sqlite" ]
k2v = [ "garage_util/k2v" ] k2v = [ "garage_util/k2v" ]
lmdb = [ "garage_db/lmdb" ] lmdb = [ "garage_db/lmdb" ]
sqlite = [ "garage_db/sqlite" ] sqlite = [ "garage_db/sqlite" ]
fjall = [ "garage_db/fjall" ]

View file

@ -22,10 +22,14 @@ mod v08 {
pub use v08::*; pub use v08::*;
impl BucketAlias { impl BucketAlias {
pub fn new(name: String, ts: u64, bucket_id: Option<Uuid>) -> Self { pub fn new(name: String, ts: u64, bucket_id: Option<Uuid>) -> Option<Self> {
BucketAlias { if !is_valid_bucket_name(&name) {
name, None
state: crdt::Lww::raw(ts, bucket_id), } else {
Some(BucketAlias {
name,
state: crdt::Lww::raw(ts, bucket_id),
})
} }
} }
@ -76,7 +80,7 @@ impl TableSchema for BucketAliasTable {
/// In the case of Garage, bucket names must not be a hex-encoded /// In the case of Garage, bucket names must not be a hex-encoded
/// 32-byte string, which is excluded thanks to the /// 32-byte string, which is excluded thanks to the
/// maximum length of 63 bytes given in the spec. /// maximum length of 63 bytes given in the spec.
pub fn is_valid_bucket_name(n: &str, puny: bool) -> bool { pub fn is_valid_bucket_name(n: &str) -> bool {
// Bucket names must be between 3 and 63 characters // Bucket names must be between 3 and 63 characters
n.len() >= 3 && n.len() <= 63 n.len() >= 3 && n.len() <= 63
// Bucket names must be composed of lowercase letters, numbers, // Bucket names must be composed of lowercase letters, numbers,
@ -88,9 +92,7 @@ pub fn is_valid_bucket_name(n: &str, puny: bool) -> bool {
// Bucket names must not be formatted as an IP address // Bucket names must not be formatted as an IP address
&& n.parse::<std::net::IpAddr>().is_err() && n.parse::<std::net::IpAddr>().is_err()
// Bucket names must not start with "xn--" // Bucket names must not start with "xn--"
&& (!n.starts_with("xn--") || puny) && !n.starts_with("xn--")
// We are a bit stricter, to properly restrict punycode in all labels
&& (!n.contains(".xn--") || puny)
// Bucket names must not end with "-s3alias" // Bucket names must not end with "-s3alias"
&& !n.ends_with("-s3alias") && !n.ends_with("-s3alias")
} }

View file

@ -116,17 +116,21 @@ impl Garage {
info!("Opening database..."); info!("Opening database...");
let db_engine = db::Engine::from_str(&config.db_engine) let db_engine = db::Engine::from_str(&config.db_engine)
.ok_or_message("Invalid `db_engine` value in configuration file")?; .ok_or_message("Invalid `db_engine` value in configuration file")?;
let db_path = db_engine.db_path(&config.metadata_dir); let mut db_path = config.metadata_dir.clone();
match db_engine {
db::Engine::Sqlite => {
db_path.push("db.sqlite");
}
db::Engine::Lmdb => {
db_path.push("db.lmdb");
}
}
let db_opt = db::OpenOpt { let db_opt = db::OpenOpt {
fsync: config.metadata_fsync, fsync: config.metadata_fsync,
lmdb_map_size: match config.lmdb_map_size { lmdb_map_size: match config.lmdb_map_size {
v if v == usize::default() => None, v if v == usize::default() => None,
v => Some(v), v => Some(v),
}, },
fjall_block_cache_size: match config.fjall_block_cache_size {
v if v == usize::default() => None,
v => Some(v),
},
}; };
let db = db::open_db(&db_path, db_engine, &db_opt) let db = db::open_db(&db_path, db_engine, &db_opt)
.ok_or_message("Unable to open metadata db")?; .ok_or_message("Unable to open metadata db")?;
@ -171,7 +175,13 @@ impl Garage {
// ---- admin tables ---- // ---- admin tables ----
info!("Initialize bucket_table..."); info!("Initialize bucket_table...");
let bucket_table = Table::new(BucketTable, control_rep_param.clone(), system.clone(), &db); let bucket_table = Table::new(
BucketTable,
control_rep_param.clone(),
system.clone(),
&db,
&config.experimental.merkle_backpressure,
);
info!("Initialize bucket_alias_table..."); info!("Initialize bucket_alias_table...");
let bucket_alias_table = Table::new( let bucket_alias_table = Table::new(
@ -179,9 +189,16 @@ impl Garage {
control_rep_param.clone(), control_rep_param.clone(),
system.clone(), system.clone(),
&db, &db,
&config.experimental.merkle_backpressure,
); );
info!("Initialize key_table_table..."); info!("Initialize key_table_table...");
let key_table = Table::new(KeyTable, control_rep_param, system.clone(), &db); let key_table = Table::new(
KeyTable,
control_rep_param,
system.clone(),
&db,
&config.experimental.merkle_backpressure,
);
// ---- S3 tables ---- // ---- S3 tables ----
info!("Initialize block_ref_table..."); info!("Initialize block_ref_table...");
@ -192,6 +209,7 @@ impl Garage {
meta_rep_param.clone(), meta_rep_param.clone(),
system.clone(), system.clone(),
&db, &db,
&config.experimental.merkle_backpressure,
); );
info!("Initialize version_table..."); info!("Initialize version_table...");
@ -202,10 +220,12 @@ impl Garage {
meta_rep_param.clone(), meta_rep_param.clone(),
system.clone(), system.clone(),
&db, &db,
&config.experimental.merkle_backpressure,
); );
info!("Initialize multipart upload counter table..."); info!("Initialize multipart upload counter table...");
let mpu_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db); let mpu_counter_table =
IndexCounter::new(system.clone(), meta_rep_param.clone(), &db, &config);
info!("Initialize multipart upload table..."); info!("Initialize multipart upload table...");
let mpu_table = Table::new( let mpu_table = Table::new(
@ -216,10 +236,12 @@ impl Garage {
meta_rep_param.clone(), meta_rep_param.clone(),
system.clone(), system.clone(),
&db, &db,
&config.experimental.merkle_backpressure,
); );
info!("Initialize object counter table..."); info!("Initialize object counter table...");
let object_counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), &db); let object_counter_table =
IndexCounter::new(system.clone(), meta_rep_param.clone(), &db, &config);
info!("Initialize object_table..."); info!("Initialize object_table...");
#[allow(clippy::redundant_clone)] #[allow(clippy::redundant_clone)]
@ -232,6 +254,7 @@ impl Garage {
meta_rep_param.clone(), meta_rep_param.clone(),
system.clone(), system.clone(),
&db, &db,
&config.experimental.merkle_backpressure,
); );
info!("Load lifecycle worker state..."); info!("Load lifecycle worker state...");
@ -241,7 +264,7 @@ impl Garage {
// ---- K2V ---- // ---- K2V ----
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param); let k2v = GarageK2V::new(system.clone(), &db, meta_rep_param, &config);
// ---- setup block refcount recalculation ---- // ---- setup block refcount recalculation ----
// this function can be used to fix inconsistencies in the RC table // this function can be used to fix inconsistencies in the RC table
@ -315,15 +338,15 @@ impl Garage {
Ok(()) Ok(())
} }
pub fn bucket_helper(&self) -> helper::bucket::BucketHelper<'_> { pub fn bucket_helper(&self) -> helper::bucket::BucketHelper {
helper::bucket::BucketHelper(self) helper::bucket::BucketHelper(self)
} }
pub fn key_helper(&self) -> helper::key::KeyHelper<'_> { pub fn key_helper(&self) -> helper::key::KeyHelper {
helper::key::KeyHelper(self) helper::key::KeyHelper(self)
} }
pub async fn locked_helper(&self) -> helper::locked::LockedHelper<'_> { pub async fn locked_helper(&self) -> helper::locked::LockedHelper {
let lock = self.bucket_lock.lock().await; let lock = self.bucket_lock.lock().await;
helper::locked::LockedHelper(self, Some(lock)) helper::locked::LockedHelper(self, Some(lock))
} }
@ -331,9 +354,14 @@ impl Garage {
#[cfg(feature = "k2v")] #[cfg(feature = "k2v")]
impl GarageK2V { impl GarageK2V {
fn new(system: Arc<System>, db: &db::Db, meta_rep_param: TableShardedReplication) -> Self { fn new(
system: Arc<System>,
db: &db::Db,
meta_rep_param: TableShardedReplication,
config: &Config,
) -> Self {
info!("Initialize K2V counter table..."); info!("Initialize K2V counter table...");
let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db); let counter_table = IndexCounter::new(system.clone(), meta_rep_param.clone(), db, config);
info!("Initialize K2V subscription manager..."); info!("Initialize K2V subscription manager...");
let subscriptions = Arc::new(SubscriptionManager::new()); let subscriptions = Arc::new(SubscriptionManager::new());
@ -347,6 +375,7 @@ impl GarageK2V {
meta_rep_param, meta_rep_param,
system.clone(), system.clone(),
db, db,
&config.experimental.merkle_backpressure,
); );
info!("Initialize K2V RPC handler..."); info!("Initialize K2V RPC handler...");

View file

@ -1,24 +1,24 @@
use err_derive::Error;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use thiserror::Error;
use garage_util::error::Error as GarageError; use garage_util::error::Error as GarageError;
#[derive(Debug, Error, Serialize, Deserialize)] #[derive(Debug, Error, Serialize, Deserialize)]
pub enum Error { pub enum Error {
#[error("Internal error: {0}")] #[error(display = "Internal error: {}", _0)]
Internal(#[from] GarageError), Internal(#[error(source)] GarageError),
#[error("Bad request: {0}")] #[error(display = "Bad request: {}", _0)]
BadRequest(String), BadRequest(String),
/// Bucket name is not valid according to AWS S3 specs /// Bucket name is not valid according to AWS S3 specs
#[error("Invalid bucket name: {0}")] #[error(display = "Invalid bucket name: {}", _0)]
InvalidBucketName(String), InvalidBucketName(String),
#[error("Access key not found: {0}")] #[error(display = "Access key not found: {}", _0)]
NoSuchAccessKey(String), NoSuchAccessKey(String),
#[error("Bucket not found: {0}")] #[error(display = "Bucket not found: {}", _0)]
NoSuchBucket(String), NoSuchBucket(String),
} }

View file

@ -1,7 +1,3 @@
use std::collections::{HashMap, HashSet};
use garage_db as db;
use garage_util::crdt::*; use garage_util::crdt::*;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::{Error as GarageError, OkOrMessage}; use garage_util::error::{Error as GarageError, OkOrMessage};
@ -51,10 +47,6 @@ impl<'a> LockedHelper<'a> {
KeyHelper(self.0) KeyHelper(self.0)
} }
// ================================================
// global bucket aliases
// ================================================
/// Sets a new alias for a bucket in global namespace. /// Sets a new alias for a bucket in global namespace.
/// This function fails if: /// This function fails if:
/// - alias name is not valid according to S3 spec /// - alias name is not valid according to S3 spec
@ -65,7 +57,7 @@ impl<'a> LockedHelper<'a> {
bucket_id: Uuid, bucket_id: Uuid,
alias_name: &String, alias_name: &String,
) -> Result<(), Error> { ) -> Result<(), Error> {
if !is_valid_bucket_name(alias_name, self.0.config.allow_punycode) { if !is_valid_bucket_name(alias_name) {
return Err(Error::InvalidBucketName(alias_name.to_string())); return Err(Error::InvalidBucketName(alias_name.to_string()));
} }
@ -96,7 +88,8 @@ impl<'a> LockedHelper<'a> {
// writes are now done and all writes use timestamp alias_ts // writes are now done and all writes use timestamp alias_ts
let alias = match alias { let alias = match alias {
None => BucketAlias::new(alias_name.clone(), alias_ts, Some(bucket_id)), None => BucketAlias::new(alias_name.clone(), alias_ts, Some(bucket_id))
.ok_or_else(|| Error::InvalidBucketName(alias_name.clone()))?,
Some(mut a) => { Some(mut a) => {
a.state = Lww::raw(alias_ts, Some(bucket_id)); a.state = Lww::raw(alias_ts, Some(bucket_id));
a a
@ -187,14 +180,13 @@ impl<'a> LockedHelper<'a> {
.ok_or_else(|| Error::NoSuchBucket(alias_name.to_string()))?; .ok_or_else(|| Error::NoSuchBucket(alias_name.to_string()))?;
// Checks ok, remove alias // Checks ok, remove alias
let alias_ts = increment_logical_clock_2( let alias_ts = match bucket.state.as_option() {
alias.state.timestamp(), Some(bucket_state) => increment_logical_clock_2(
bucket alias.state.timestamp(),
.state bucket_state.aliases.get_timestamp(alias_name),
.as_option() ),
.map(|p| p.aliases.get_timestamp(alias_name)) None => increment_logical_clock(alias.state.timestamp()),
.unwrap_or(0), };
);
// ---- timestamp-ensured causality barrier ---- // ---- timestamp-ensured causality barrier ----
// writes are now done and all writes use timestamp alias_ts // writes are now done and all writes use timestamp alias_ts
@ -212,10 +204,6 @@ impl<'a> LockedHelper<'a> {
Ok(()) Ok(())
} }
// ================================================
// local bucket aliases
// ================================================
/// Sets a new alias for a bucket in the local namespace of a key. /// Sets a new alias for a bucket in the local namespace of a key.
/// This function fails if: /// This function fails if:
/// - alias name is not valid according to S3 spec /// - alias name is not valid according to S3 spec
@ -228,12 +216,14 @@ impl<'a> LockedHelper<'a> {
key_id: &String, key_id: &String,
alias_name: &String, alias_name: &String,
) -> Result<(), Error> { ) -> Result<(), Error> {
if !is_valid_bucket_name(alias_name, self.0.config.allow_punycode) { let key_helper = KeyHelper(self.0);
if !is_valid_bucket_name(alias_name) {
return Err(Error::InvalidBucketName(alias_name.to_string())); return Err(Error::InvalidBucketName(alias_name.to_string()));
} }
let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?; let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?;
let mut key = self.key().get_existing_key(key_id).await?; let mut key = key_helper.get_existing_key(key_id).await?;
let key_param = key.state.as_option_mut().unwrap(); let key_param = key.state.as_option_mut().unwrap();
@ -282,13 +272,23 @@ impl<'a> LockedHelper<'a> {
key_id: &String, key_id: &String,
alias_name: &String, alias_name: &String,
) -> Result<(), Error> { ) -> Result<(), Error> {
let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?; let key_helper = KeyHelper(self.0);
let mut key = self.key().get_existing_key(key_id).await?;
let mut bucket = self.bucket().get_existing_bucket(bucket_id).await?;
let mut key = key_helper.get_existing_key(key_id).await?;
let key_p = key.state.as_option().unwrap();
let bucket_p = bucket.state.as_option_mut().unwrap(); let bucket_p = bucket.state.as_option_mut().unwrap();
if key_p.local_aliases.get(alias_name).cloned().flatten() != Some(bucket_id) { if key
.state
.as_option()
.unwrap()
.local_aliases
.get(alias_name)
.cloned()
.flatten()
!= Some(bucket_id)
{
return Err(GarageError::Message(format!( return Err(GarageError::Message(format!(
"Bucket {:?} does not have alias {} in namespace of key {}", "Bucket {:?} does not have alias {} in namespace of key {}",
bucket_id, alias_name, key_id bucket_id, alias_name, key_id
@ -305,17 +305,17 @@ impl<'a> LockedHelper<'a> {
.local_aliases .local_aliases
.items() .items()
.iter() .iter()
.any(|((k, n), _, active)| (*k != key.key_id || n != alias_name) && *active); .any(|((k, n), _, active)| *k == key.key_id && n == alias_name && *active);
if !has_other_global_aliases && !has_other_local_aliases { if !has_other_global_aliases && !has_other_local_aliases {
return Err(Error::BadRequest(format!("Bucket {} doesn't have other aliases, please delete it instead of just unaliasing.", alias_name))); return Err(Error::BadRequest(format!("Bucket {} doesn't have other aliases, please delete it instead of just unaliasing.", alias_name)));
} }
// Checks ok, remove alias // Checks ok, remove alias
let key_param = key.state.as_option_mut().unwrap();
let bucket_p_local_alias_key = (key.key_id.clone(), alias_name.clone()); let bucket_p_local_alias_key = (key.key_id.clone(), alias_name.clone());
let alias_ts = increment_logical_clock_2( let alias_ts = increment_logical_clock_2(
key_p.local_aliases.get_timestamp(alias_name), key_param.local_aliases.get_timestamp(alias_name),
bucket_p bucket_p
.local_aliases .local_aliases
.get_timestamp(&bucket_p_local_alias_key), .get_timestamp(&bucket_p_local_alias_key),
@ -324,8 +324,7 @@ impl<'a> LockedHelper<'a> {
// ---- timestamp-ensured causality barrier ---- // ---- timestamp-ensured causality barrier ----
// writes are now done and all writes use timestamp alias_ts // writes are now done and all writes use timestamp alias_ts
key.state.as_option_mut().unwrap().local_aliases = key_param.local_aliases = LwwMap::raw_item(alias_name.clone(), alias_ts, None);
LwwMap::raw_item(alias_name.clone(), alias_ts, None);
self.0.key_table.insert(&key).await?; self.0.key_table.insert(&key).await?;
bucket_p.local_aliases = LwwMap::raw_item(bucket_p_local_alias_key, alias_ts, false); bucket_p.local_aliases = LwwMap::raw_item(bucket_p_local_alias_key, alias_ts, false);
@ -334,68 +333,21 @@ impl<'a> LockedHelper<'a> {
Ok(()) Ok(())
} }
/// Ensures a bucket does not have a certain local alias.
/// Contrarily to unset_local_bucket_alias, this does not
/// fail on any condition other than:
/// - bucket cannot be found (its fine if it is in deleted state)
/// - key cannot be found (its fine if alias in key points to nothing
/// or to another bucket)
pub async fn purge_local_bucket_alias(
&self,
bucket_id: Uuid,
key_id: &String,
alias_name: &String,
) -> Result<(), Error> {
let mut bucket = self.bucket().get_internal_bucket(bucket_id).await?;
let mut key = self.key().get_internal_key(key_id).await?;
let bucket_p_local_alias_key = (key.key_id.clone(), alias_name.clone());
let alias_ts = increment_logical_clock_2(
key.state
.as_option()
.map(|p| p.local_aliases.get_timestamp(alias_name))
.unwrap_or(0),
bucket
.state
.as_option()
.map(|p| p.local_aliases.get_timestamp(&bucket_p_local_alias_key))
.unwrap_or(0),
);
// ---- timestamp-ensured causality barrier ----
// writes are now done and all writes use timestamp alias_ts
if let Some(kp) = key.state.as_option_mut() {
kp.local_aliases = LwwMap::raw_item(alias_name.clone(), alias_ts, None);
self.0.key_table.insert(&key).await?;
}
if let Some(bp) = bucket.state.as_option_mut() {
bp.local_aliases = LwwMap::raw_item(bucket_p_local_alias_key, alias_ts, false);
self.0.bucket_table.insert(&bucket).await?;
}
Ok(())
}
// ================================================
// permissions
// ================================================
/// Sets permissions for a key on a bucket. /// Sets permissions for a key on a bucket.
/// This function fails if: /// This function fails if:
/// - bucket or key cannot be found at all (its ok if they are in deleted state) /// - bucket or key cannot be found at all (its ok if they are in deleted state)
/// - bucket or key is in deleted state and we are trying to set /// - bucket or key is in deleted state and we are trying to set permissions other than "deny
/// permissions other than "deny all" /// all"
pub async fn set_bucket_key_permissions( pub async fn set_bucket_key_permissions(
&self, &self,
bucket_id: Uuid, bucket_id: Uuid,
key_id: &String, key_id: &String,
mut perm: BucketKeyPerm, mut perm: BucketKeyPerm,
) -> Result<(), Error> { ) -> Result<(), Error> {
let key_helper = KeyHelper(self.0);
let mut bucket = self.bucket().get_internal_bucket(bucket_id).await?; let mut bucket = self.bucket().get_internal_bucket(bucket_id).await?;
let mut key = self.key().get_internal_key(key_id).await?; let mut key = key_helper.get_internal_key(key_id).await?;
if let Some(bstate) = bucket.state.as_option() { if let Some(bstate) = bucket.state.as_option() {
if let Some(kp) = bstate.authorized_keys.get(key_id) { if let Some(kp) = bstate.authorized_keys.get(key_id) {
@ -432,20 +384,21 @@ impl<'a> LockedHelper<'a> {
Ok(()) Ok(())
} }
// ================================================ // ----
// keys
// ================================================
/// Deletes an API access key /// Deletes an API access key
pub async fn delete_key(&self, key: &mut Key) -> Result<(), Error> { pub async fn delete_key(&self, key: &mut Key) -> Result<(), Error> {
let state = key.state.as_option_mut().unwrap(); let state = key.state.as_option_mut().unwrap();
// --- done checking, now commit --- // --- done checking, now commit ---
// (the step at unset_local_bucket_alias will fail if a bucket
// does not have another alias, the deletion will be
// interrupted in the middle if that happens)
// 1. Delete local aliases // 1. Delete local aliases
for (alias, _, to) in state.local_aliases.items().iter() { for (alias, _, to) in state.local_aliases.items().iter() {
if let Some(bucket_id) = to { if let Some(bucket_id) = to {
self.purge_local_bucket_alias(*bucket_id, &key.key_id, alias) self.unset_local_bucket_alias(*bucket_id, &key.key_id, alias)
.await?; .await?;
} }
} }
@ -462,193 +415,4 @@ impl<'a> LockedHelper<'a> {
Ok(()) Ok(())
} }
// ================================================
// repair procedure
// ================================================
pub async fn repair_aliases(&self) -> Result<(), GarageError> {
self.0.db.transaction(|tx| {
info!("--- begin repair_aliases transaction ----");
// 1. List all non-deleted buckets, so that we can fix bad aliases
let mut all_buckets: HashSet<Uuid> = HashSet::new();
for item in tx.range::<&[u8], _>(&self.0.bucket_table.data.store, ..)? {
let bucket = self
.0
.bucket_table
.data
.decode_entry(&(item?.1))
.map_err(db::TxError::Abort)?;
if !bucket.is_deleted() {
all_buckets.insert(bucket.id);
}
}
info!("number of buckets: {}", all_buckets.len());
// 2. List all aliases declared in bucket_alias_table and key_table
// Take note of aliases that point to non-existing buckets
let mut global_aliases: HashMap<String, Uuid> = HashMap::new();
{
let mut delete_global = vec![];
for item in tx.range::<&[u8], _>(&self.0.bucket_alias_table.data.store, ..)? {
let mut alias = self
.0
.bucket_alias_table
.data
.decode_entry(&(item?.1))
.map_err(db::TxError::Abort)?;
if let Some(id) = alias.state.get() {
if all_buckets.contains(id) {
// keep aliases
global_aliases.insert(alias.name().to_string(), *id);
} else {
// delete alias
warn!(
"global alias: remove {} -> {:?} (bucket is deleted)",
alias.name(),
id
);
alias.state.update(None);
delete_global.push(alias);
}
}
}
info!("number of global aliases: {}", global_aliases.len());
info!("global alias table: {} entries fixed", delete_global.len());
for ga in delete_global {
debug!("Enqueue update to global alias table: {:?}", ga);
self.0.bucket_alias_table.queue_insert(tx, &ga)?;
}
}
let mut local_aliases: HashMap<(String, String), Uuid> = HashMap::new();
{
let mut delete_local = vec![];
for item in tx.range::<&[u8], _>(&self.0.key_table.data.store, ..)? {
let mut key = self
.0
.key_table
.data
.decode_entry(&(item?.1))
.map_err(db::TxError::Abort)?;
let Some(p) = key.state.as_option_mut() else {
continue;
};
let mut has_changes = false;
for (name, _, to) in p.local_aliases.items().to_vec() {
if let Some(id) = to {
if all_buckets.contains(&id) {
local_aliases.insert((key.key_id.clone(), name), id);
} else {
warn!(
"local alias: remove ({}, {}) -> {:?} (bucket is deleted)",
key.key_id, name, id
);
p.local_aliases.update_in_place(name, None);
has_changes = true;
}
}
}
if has_changes {
delete_local.push(key);
}
}
info!("number of local aliases: {}", local_aliases.len());
info!("key table: {} entries fixed", delete_local.len());
for la in delete_local {
debug!("Enqueue update to key table: {:?}", la);
self.0.key_table.queue_insert(tx, &la)?;
}
}
// 4. Reverse the alias maps to determine the aliases per-bucket
let mut bucket_global: HashMap<Uuid, Vec<String>> = HashMap::new();
let mut bucket_local: HashMap<Uuid, Vec<(String, String)>> = HashMap::new();
for (name, bucket) in global_aliases {
bucket_global.entry(bucket).or_default().push(name);
}
for ((key, name), bucket) in local_aliases {
bucket_local.entry(bucket).or_default().push((key, name));
}
// 5. Fix the bucket table to ensure consistency
let mut bucket_updates = vec![];
for item in tx.range::<&[u8], _>(&self.0.bucket_table.data.store, ..)? {
let bucket = self
.0
.bucket_table
.data
.decode_entry(&(item?.1))
.map_err(db::TxError::Abort)?;
let mut bucket2 = bucket.clone();
let Some(param) = bucket2.state.as_option_mut() else {
continue;
};
// fix global aliases
{
let ga = bucket_global.remove(&bucket.id).unwrap_or_default();
for (name, _, active) in param.aliases.items().to_vec() {
if active && !ga.contains(&name) {
warn!("bucket {:?}: remove global alias {}", bucket.id, name);
param.aliases.update_in_place(name, false);
}
}
for name in ga {
if param.aliases.get(&name).copied() != Some(true) {
warn!("bucket {:?}: add global alias {}", bucket.id, name);
param.aliases.update_in_place(name, true);
}
}
}
// fix local aliases
{
let la = bucket_local.remove(&bucket.id).unwrap_or_default();
for (pair, _, active) in param.local_aliases.items().to_vec() {
if active && !la.contains(&pair) {
warn!("bucket {:?}: remove local alias {:?}", bucket.id, pair);
param.local_aliases.update_in_place(pair, false);
}
}
for pair in la {
if param.local_aliases.get(&pair).copied() != Some(true) {
warn!("bucket {:?}: add local alias {:?}", bucket.id, pair);
param.local_aliases.update_in_place(pair, true);
}
}
}
if bucket2 != bucket {
bucket_updates.push(bucket2);
}
}
info!("bucket table: {} entries fixed", bucket_updates.len());
for b in bucket_updates {
debug!("Enqueue update to bucket table: {:?}", b);
self.0.bucket_table.queue_insert(tx, &b)?;
}
info!("--- end repair_aliases transaction ----");
Ok(())
})?;
info!("repair_aliases is done");
Ok(())
}
} }

View file

@ -10,6 +10,7 @@ use garage_db as db;
use garage_rpc::layout::LayoutHelper; use garage_rpc::layout::LayoutHelper;
use garage_rpc::system::System; use garage_rpc::system::System;
use garage_util::background::BackgroundRunner; use garage_util::background::BackgroundRunner;
use garage_util::config::Config;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::migrate::Migrate; use garage_util::migrate::Migrate;
@ -173,6 +174,7 @@ impl<T: CountedItem> IndexCounter<T> {
system: Arc<System>, system: Arc<System>,
replication: TableShardedReplication, replication: TableShardedReplication,
db: &db::Db, db: &db::Db,
config: &Config,
) -> Arc<Self> { ) -> Arc<Self> {
Arc::new(Self { Arc::new(Self {
this_node: system.id, this_node: system.id,
@ -186,6 +188,7 @@ impl<T: CountedItem> IndexCounter<T> {
replication, replication,
system, system,
db, db,
&config.experimental.merkle_backpressure,
), ),
}) })
} }

View file

@ -121,13 +121,13 @@ impl Worker for LifecycleWorker {
mpu_aborted, mpu_aborted,
.. ..
} => { } => {
let n_objects = self.garage.object_table.data.store.approximate_len().ok(); let n_objects = self.garage.object_table.data.store.len().ok();
let progress = match n_objects { let progress = match n_objects {
Some(total) if total > 0 => format!( None => "...".to_string(),
Some(total) => format!(
"~{:.2}%", "~{:.2}%",
100. * std::cmp::min(*counter, total) as f32 / total as f32 100. * std::cmp::min(*counter, total) as f32 / total as f32
), ),
_ => "...".to_string(),
}; };
WorkerStatus { WorkerStatus {
progress: Some(progress), progress: Some(progress),

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_net" name = "garage_net"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -30,7 +30,7 @@ rand.workspace = true
log.workspace = true log.workspace = true
arc-swap.workspace = true arc-swap.workspace = true
thiserror.workspace = true err-derive.workspace = true
bytes.workspace = true bytes.workspace = true
cfg-if.workspace = true cfg-if.workspace = true
@ -39,6 +39,7 @@ kuska-handshake.workspace = true
opentelemetry = { workspace = true, optional = true } opentelemetry = { workspace = true, optional = true }
opentelemetry-contrib = { workspace = true, optional = true } opentelemetry-contrib = { workspace = true, optional = true }
tracing.workspace = true
[dev-dependencies] [dev-dependencies]
pretty_env_logger.workspace = true pretty_env_logger.workspace = true

View file

@ -4,6 +4,7 @@ use std::pin::Pin;
use std::sync::atomic::{self, AtomicU32}; use std::sync::atomic::{self, AtomicU32};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
use std::task::Poll; use std::task::Poll;
use tracing::*;
use arc_swap::ArcSwapOption; use arc_swap::ArcSwapOption;
use bytes::Bytes; use bytes::Bytes;
@ -14,7 +15,7 @@ use futures::Stream;
use kuska_handshake::async_std::{handshake_client, BoxStream}; use kuska_handshake::async_std::{handshake_client, BoxStream};
use tokio::net::TcpStream; use tokio::net::TcpStream;
use tokio::select; use tokio::select;
use tokio::sync::{mpsc, oneshot, watch}; use tokio::sync::{mpsc, oneshot, watch, Semaphore};
use tokio_util::compat::*; use tokio_util::compat::*;
#[cfg(feature = "telemetry")] #[cfg(feature = "telemetry")]
@ -25,6 +26,7 @@ use opentelemetry::{
#[cfg(feature = "telemetry")] #[cfg(feature = "telemetry")]
use opentelemetry_contrib::trace::propagator::binary::*; use opentelemetry_contrib::trace::propagator::binary::*;
use crate::endpoint::RpcInFlightLimiter;
use crate::error::*; use crate::error::*;
use crate::message::*; use crate::message::*;
use crate::netapp::*; use crate::netapp::*;
@ -41,6 +43,7 @@ pub(crate) struct ClientConn {
next_query_number: AtomicU32, next_query_number: AtomicU32,
inflight: Mutex<HashMap<RequestID, oneshot::Sender<ByteStream>>>, inflight: Mutex<HashMap<RequestID, oneshot::Sender<ByteStream>>>,
rpc_table_write_inflight_limiter: Option<Semaphore>,
} }
impl ClientConn { impl ClientConn {
@ -98,8 +101,14 @@ impl ClientConn {
next_query_number: AtomicU32::from(RequestID::default()), next_query_number: AtomicU32::from(RequestID::default()),
query_send: ArcSwapOption::new(Some(Arc::new(query_send))), query_send: ArcSwapOption::new(Some(Arc::new(query_send))),
inflight: Mutex::new(HashMap::new()), inflight: Mutex::new(HashMap::new()),
rpc_table_write_inflight_limiter: netapp.max_in_flight_table_write.map(Semaphore::new),
}); });
info!(
"Created conn with table write limit set to {}",
netapp.max_in_flight_table_write.unwrap_or(0)
);
netapp.connected_as_client(peer_id, conn.clone()); netapp.connected_as_client(peer_id, conn.clone());
let debug_name = format!("CLI {}", hex::encode(&peer_id[..8])); let debug_name = format!("CLI {}", hex::encode(&peer_id[..8]));
@ -144,10 +153,21 @@ impl ClientConn {
req: Req<T>, req: Req<T>,
path: &str, path: &str,
prio: RequestPriority, prio: RequestPriority,
limiter: RpcInFlightLimiter,
) -> Result<Resp<T>, Error> ) -> Result<Resp<T>, Error>
where where
T: Message, T: Message,
{ {
let _permit = match (limiter, &self.rpc_table_write_inflight_limiter) {
(RpcInFlightLimiter::TableWrite, Some(sem)) => {
info!(
"Available RPC table write slots: {}",
sem.available_permits()
);
Some(sem.acquire().await.unwrap())
}
_ => None,
};
let query_send = self.query_send.load_full().ok_or(Error::ConnectionClosed)?; let query_send = self.query_send.load_full().ok_or(Error::ConnectionClosed)?;
let id = self let id = self
@ -212,6 +232,7 @@ impl ClientConn {
let stream = Box::pin(canceller.for_stream(stream)); let stream = Box::pin(canceller.for_stream(stream));
let resp_enc = RespEnc::decode(stream).await?; let resp_enc = RespEnc::decode(stream).await?;
drop(_permit);
debug!("client: got response to request {} (path {})", id, path); debug!("client: got response to request {} (path {})", id, path);
Resp::from_enc(resp_enc) Resp::from_enc(resp_enc)
} }

View file

@ -57,6 +57,13 @@ where
} }
} }
#[derive(Debug, Copy, Clone, Default)]
pub enum RpcInFlightLimiter {
#[default]
NoLimit,
TableWrite,
}
// ---- // ----
/// This struct represents an endpoint for message of type `M`. /// This struct represents an endpoint for message of type `M`.
@ -114,6 +121,7 @@ where
target: &NodeID, target: &NodeID,
req: T, req: T,
prio: RequestPriority, prio: RequestPriority,
limiter: RpcInFlightLimiter,
) -> Result<Resp<M>, Error> ) -> Result<Resp<M>, Error>
where where
T: IntoReq<M>, T: IntoReq<M>,
@ -136,7 +144,10 @@ where
"Not connected: {}", "Not connected: {}",
hex::encode(&target[..8]) hex::encode(&target[..8])
))), ))),
Some(c) => c.call(req.into_req()?, self.path.as_str(), prio).await, Some(c) => {
c.call(req.into_req()?, self.path.as_str(), prio, limiter)
.await
}
} }
} }
} }
@ -149,8 +160,12 @@ where
target: &NodeID, target: &NodeID,
req: M, req: M,
prio: RequestPriority, prio: RequestPriority,
limiter: RpcInFlightLimiter,
) -> Result<<M as Message>::Response, Error> { ) -> Result<<M as Message>::Response, Error> {
Ok(self.call_streaming(target, req, prio).await?.into_msg()) Ok(self
.call_streaming(target, req, prio, limiter)
.await?
.into_msg())
} }
} }
@ -159,7 +174,7 @@ where
pub(crate) type DynEndpoint = Box<dyn GenericEndpoint + Send + Sync>; pub(crate) type DynEndpoint = Box<dyn GenericEndpoint + Send + Sync>;
pub(crate) trait GenericEndpoint { pub(crate) trait GenericEndpoint {
fn handle(&self, req_enc: ReqEnc, from: NodeID) -> BoxFuture<'_, Result<RespEnc, Error>>; fn handle(&self, req_enc: ReqEnc, from: NodeID) -> BoxFuture<Result<RespEnc, Error>>;
fn drop_handler(&self); fn drop_handler(&self);
fn clone_endpoint(&self) -> DynEndpoint; fn clone_endpoint(&self) -> DynEndpoint;
} }
@ -175,7 +190,7 @@ where
M: Message, M: Message,
H: StreamingEndpointHandler<M> + 'static, H: StreamingEndpointHandler<M> + 'static,
{ {
fn handle(&self, req_enc: ReqEnc, from: NodeID) -> BoxFuture<'_, Result<RespEnc, Error>> { fn handle(&self, req_enc: ReqEnc, from: NodeID) -> BoxFuture<Result<RespEnc, Error>> {
async move { async move {
match self.0.handler.load_full() { match self.0.handler.load_full() {
None => Err(Error::NoHandler), None => Err(Error::NoHandler),

View file

@ -1,49 +1,49 @@
use std::io; use std::io;
use err_derive::Error;
use log::error; use log::error;
use thiserror::Error;
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum Error { pub enum Error {
#[error("IO error: {0}")] #[error(display = "IO error: {}", _0)]
Io(#[from] io::Error), Io(#[error(source)] io::Error),
#[error("Messagepack encode error: {0}")] #[error(display = "Messagepack encode error: {}", _0)]
RMPEncode(#[from] rmp_serde::encode::Error), RMPEncode(#[error(source)] rmp_serde::encode::Error),
#[error("Messagepack decode error: {0}")] #[error(display = "Messagepack decode error: {}", _0)]
RMPDecode(#[from] rmp_serde::decode::Error), RMPDecode(#[error(source)] rmp_serde::decode::Error),
#[error("Tokio join error: {0}")] #[error(display = "Tokio join error: {}", _0)]
TokioJoin(#[from] tokio::task::JoinError), TokioJoin(#[error(source)] tokio::task::JoinError),
#[error("oneshot receive error: {0}")] #[error(display = "oneshot receive error: {}", _0)]
OneshotRecv(#[from] tokio::sync::oneshot::error::RecvError), OneshotRecv(#[error(source)] tokio::sync::oneshot::error::RecvError),
#[error("Handshake error: {0}")] #[error(display = "Handshake error: {}", _0)]
Handshake(#[from] kuska_handshake::async_std::Error), Handshake(#[error(source)] kuska_handshake::async_std::Error),
#[error("UTF8 error: {0}")] #[error(display = "UTF8 error: {}", _0)]
UTF8(#[from] std::string::FromUtf8Error), UTF8(#[error(source)] std::string::FromUtf8Error),
#[error("Framing protocol error")] #[error(display = "Framing protocol error")]
Framing, Framing,
#[error("Remote error ({0:?}): {1}")] #[error(display = "Remote error ({:?}): {}", _0, _1)]
Remote(io::ErrorKind, String), Remote(io::ErrorKind, String),
#[error("Request ID collision")] #[error(display = "Request ID collision")]
IdCollision, IdCollision,
#[error("{0}")] #[error(display = "{}", _0)]
Message(String), Message(String),
#[error("No handler / shutting down")] #[error(display = "No handler / shutting down")]
NoHandler, NoHandler,
#[error("Connection closed")] #[error(display = "Connection closed")]
ConnectionClosed, ConnectionClosed,
#[error("Version mismatch: {0}")] #[error(display = "Version mismatch: {}", _0)]
VersionMismatch(String), VersionMismatch(String),
} }

View file

@ -74,6 +74,8 @@ pub struct NetApp {
pub id: NodeID, pub id: NodeID,
/// Private key associated with our peer ID /// Private key associated with our peer ID
pub privkey: ed25519::SecretKey, pub privkey: ed25519::SecretKey,
/// Config related to netapp
pub(crate) max_in_flight_table_write: Option<usize>,
pub(crate) server_conns: RwLock<HashMap<NodeID, Arc<ServerConn>>>, pub(crate) server_conns: RwLock<HashMap<NodeID, Arc<ServerConn>>>,
pub(crate) client_conns: RwLock<HashMap<NodeID, Arc<ClientConn>>>, pub(crate) client_conns: RwLock<HashMap<NodeID, Arc<ClientConn>>>,
@ -101,6 +103,7 @@ impl NetApp {
netid: auth::Key, netid: auth::Key,
privkey: ed25519::SecretKey, privkey: ed25519::SecretKey,
bind_outgoing_to: Option<IpAddr>, bind_outgoing_to: Option<IpAddr>,
max_in_flight_table_write: Option<usize>,
) -> Arc<Self> { ) -> Arc<Self> {
let mut version_tag = [0u8; 16]; let mut version_tag = [0u8; 16];
version_tag[0..8].copy_from_slice(&u64::to_be_bytes(NETAPP_VERSION_TAG)[..]); version_tag[0..8].copy_from_slice(&u64::to_be_bytes(NETAPP_VERSION_TAG)[..]);
@ -114,6 +117,7 @@ impl NetApp {
netid, netid,
id, id,
privkey, privkey,
max_in_flight_table_write,
server_conns: RwLock::new(HashMap::new()), server_conns: RwLock::new(HashMap::new()),
client_conns: RwLock::new(HashMap::new()), client_conns: RwLock::new(HashMap::new()),
endpoints: RwLock::new(HashMap::new()), endpoints: RwLock::new(HashMap::new()),
@ -427,6 +431,7 @@ impl NetApp {
server_port, server_port,
}, },
PRIO_NORMAL, PRIO_NORMAL,
RpcInFlightLimiter::NoLimit,
) )
.await .await
.map(|_| ()) .map(|_| ())

View file

@ -406,7 +406,7 @@ impl PeeringManager {
ping_time ping_time
); );
let ping_response = select! { let ping_response = select! {
r = self.ping_endpoint.call(&id, ping_msg, PRIO_HIGH) => r, r = self.ping_endpoint.call(&id, ping_msg, PRIO_HIGH, RpcInFlightLimiter::NoLimit) => r,
_ = tokio::time::sleep(ping_timeout) => Err(Error::Message("Ping timeout".into())), _ = tokio::time::sleep(ping_timeout) => Err(Error::Message("Ping timeout".into())),
}; };
@ -458,7 +458,12 @@ impl PeeringManager {
let pex_message = PeerListMessage { list: peer_list }; let pex_message = PeerListMessage { list: peer_list };
match self match self
.peer_list_endpoint .peer_list_endpoint
.call(id, pex_message, PRIO_BACKGROUND) .call(
id,
pex_message,
PRIO_BACKGROUND,
RpcInFlightLimiter::NoLimit,
)
.await .await
{ {
Err(e) => warn!("Error doing peer exchange: {}", e), Err(e) => warn!("Error doing peer exchange: {}", e),

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_rpc" name = "garage_rpc"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"
@ -33,7 +33,7 @@ async-trait.workspace = true
serde.workspace = true serde.workspace = true
serde_bytes.workspace = true serde_bytes.workspace = true
serde_json.workspace = true serde_json.workspace = true
thiserror = { workspace = true, optional = true } err-derive = { workspace = true, optional = true }
# newer version requires rust edition 2021 # newer version requires rust edition 2021
kube = { workspace = true, optional = true } kube = { workspace = true, optional = true }
@ -49,5 +49,5 @@ opentelemetry.workspace = true
[features] [features]
kubernetes-discovery = [ "kube", "k8s-openapi", "schemars" ] kubernetes-discovery = [ "kube", "k8s-openapi", "schemars" ]
consul-discovery = [ "reqwest", "thiserror" ] consul-discovery = [ "reqwest", "err-derive" ]
system-libs = [ "sodiumoxide/use-pkg-config" ] system-libs = [ "sodiumoxide/use-pkg-config" ]

View file

@ -3,8 +3,8 @@ use std::fs::File;
use std::io::Read; use std::io::Read;
use std::net::{IpAddr, SocketAddr}; use std::net::{IpAddr, SocketAddr};
use err_derive::Error;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use thiserror::Error;
use garage_net::NodeID; use garage_net::NodeID;
@ -219,12 +219,12 @@ impl ConsulDiscovery {
/// Regroup all Consul discovery errors /// Regroup all Consul discovery errors
#[derive(Debug, Error)] #[derive(Debug, Error)]
pub enum ConsulError { pub enum ConsulError {
#[error("IO error: {0}")] #[error(display = "IO error: {}", _0)]
Io(#[from] std::io::Error), Io(#[error(source)] std::io::Error),
#[error("HTTP error: {0}")] #[error(display = "HTTP error: {}", _0)]
Reqwest(#[from] reqwest::Error), Reqwest(#[error(source)] reqwest::Error),
#[error("Invalid Consul TLS configuration")] #[error(display = "Invalid Consul TLS configuration")]
InvalidTLSConfig, InvalidTLSConfig,
#[error("Token error: {0}")] #[error(display = "Token error: {}", _0)]
Token(#[from] reqwest::header::InvalidHeaderValue), Token(#[error(source)] reqwest::header::InvalidHeaderValue),
} }

View file

@ -229,11 +229,13 @@ impl LayoutManager {
} }
/// Save cluster layout data to disk /// Save cluster layout data to disk
async fn save_cluster_layout(&self) { async fn save_cluster_layout(&self) -> Result<(), Error> {
let layout = self.layout.read().unwrap().inner().clone(); let layout = self.layout.read().unwrap().inner().clone();
if let Err(e) = self.persist_cluster_layout.save_async(&layout).await { self.persist_cluster_layout
error!("Failed to save cluster_layout: {}", e); .save_async(&layout)
} .await
.expect("Cannot save current cluster layout");
Ok(())
} }
fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) { fn broadcast_update(self: &Arc<Self>, rpc: SystemRpc) {
@ -311,7 +313,7 @@ impl LayoutManager {
self.change_notify.notify_waiters(); self.change_notify.notify_waiters();
self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout)); self.broadcast_update(SystemRpc::AdvertiseClusterLayout(new_layout));
self.save_cluster_layout().await; self.save_cluster_layout().await?;
} }
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)
@ -326,7 +328,7 @@ impl LayoutManager {
if let Some(new_trackers) = self.merge_layout_trackers(trackers) { if let Some(new_trackers) = self.merge_layout_trackers(trackers) {
self.change_notify.notify_waiters(); self.change_notify.notify_waiters();
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers)); self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(new_trackers));
self.save_cluster_layout().await; self.save_cluster_layout().await?;
} }
Ok(SystemRpc::Ok) Ok(SystemRpc::Ok)

View file

@ -507,7 +507,7 @@ impl LayoutVersion {
g.compute_maximal_flow()?; g.compute_maximal_flow()?;
if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 { if g.get_flow_value()? < (NB_PARTITIONS * self.replication_factor) as i64 {
return Err(Error::Message( return Err(Error::Message(
"The storage capacity of the cluster is too small. It is \ "The storage capacity of he cluster is to small. It is \
impossible to store partitions of size 1." impossible to store partitions of size 1."
.into(), .into(),
)); ));

View file

@ -6,6 +6,7 @@ use std::time::Duration;
use futures::future::join_all; use futures::future::join_all;
use futures::stream::futures_unordered::FuturesUnordered; use futures::stream::futures_unordered::FuturesUnordered;
use futures::stream::StreamExt; use futures::stream::StreamExt;
use garage_net::endpoint::RpcInFlightLimiter;
use tokio::select; use tokio::select;
use opentelemetry::KeyValue; use opentelemetry::KeyValue;
@ -44,6 +45,8 @@ pub struct RequestStrategy<T> {
rs_timeout: Timeout, rs_timeout: Timeout,
/// Data to drop when everything completes /// Data to drop when everything completes
rs_drop_on_complete: T, rs_drop_on_complete: T,
/// RPC In Flight Limiter
rs_inflight_limiter: RpcInFlightLimiter,
} }
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
@ -61,6 +64,7 @@ impl Clone for RequestStrategy<()> {
rs_priority: self.rs_priority, rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout, rs_timeout: self.rs_timeout,
rs_drop_on_complete: (), rs_drop_on_complete: (),
rs_inflight_limiter: self.rs_inflight_limiter,
} }
} }
} }
@ -74,6 +78,7 @@ impl RequestStrategy<()> {
rs_priority: prio, rs_priority: prio,
rs_timeout: Timeout::Default, rs_timeout: Timeout::Default,
rs_drop_on_complete: (), rs_drop_on_complete: (),
rs_inflight_limiter: RpcInFlightLimiter::NoLimit,
} }
} }
/// Add an item to be dropped on completion /// Add an item to be dropped on completion
@ -84,6 +89,7 @@ impl RequestStrategy<()> {
rs_priority: self.rs_priority, rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout, rs_timeout: self.rs_timeout,
rs_drop_on_complete: drop_on_complete, rs_drop_on_complete: drop_on_complete,
rs_inflight_limiter: RpcInFlightLimiter::NoLimit,
} }
} }
} }
@ -109,6 +115,10 @@ impl<T> RequestStrategy<T> {
self.rs_timeout = Timeout::Custom(timeout); self.rs_timeout = Timeout::Custom(timeout);
self self
} }
/// Select the `RpcInFlightLimiter::TableWrite` in-flight limiter for this
/// request strategy (builder style: takes `self` by value and returns it).
pub fn with_write_limiter(mut self) -> Self {
self.rs_inflight_limiter = RpcInFlightLimiter::TableWrite;
self
}
/// Extract drop_on_complete item /// Extract drop_on_complete item
fn extract_drop_on_complete(self) -> (RequestStrategy<()>, T) { fn extract_drop_on_complete(self) -> (RequestStrategy<()>, T) {
( (
@ -118,6 +128,7 @@ impl<T> RequestStrategy<T> {
rs_priority: self.rs_priority, rs_priority: self.rs_priority,
rs_timeout: self.rs_timeout, rs_timeout: self.rs_timeout,
rs_drop_on_complete: (), rs_drop_on_complete: (),
rs_inflight_limiter: self.rs_inflight_limiter,
}, },
self.rs_drop_on_complete, self.rs_drop_on_complete,
) )
@ -185,7 +196,7 @@ impl RpcHelper {
let node_id = to.into(); let node_id = to.into();
let rpc_call = endpoint let rpc_call = endpoint
.call_streaming(&node_id, msg, strat.rs_priority) .call_streaming(&node_id, msg, strat.rs_priority, strat.rs_inflight_limiter)
.with_context(Context::current_with_span(span)) .with_context(Context::current_with_span(span))
.record_duration(&self.0.metrics.rpc_duration, &metric_tags); .record_duration(&self.0.metrics.rpc_duration, &metric_tags);

View file

@ -21,7 +21,7 @@ use garage_net::{NetApp, NetworkKey, NodeID, NodeKey};
#[cfg(feature = "kubernetes-discovery")] #[cfg(feature = "kubernetes-discovery")]
use garage_util::config::KubernetesDiscoveryConfig; use garage_util::config::KubernetesDiscoveryConfig;
use garage_util::config::{Config, DataDirEnum}; use garage_util::config::{Config, DataDirEnum, RpcInFlightLimiterEnum};
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::persister::Persister; use garage_util::persister::Persister;
@ -256,7 +256,17 @@ impl System {
let bind_outgoing_to = Some(config) let bind_outgoing_to = Some(config)
.filter(|x| x.rpc_bind_outgoing) .filter(|x| x.rpc_bind_outgoing)
.map(|x| x.rpc_bind_addr.ip()); .map(|x| x.rpc_bind_addr.ip());
let netapp = NetApp::new(GARAGE_VERSION_TAG, network_key, node_key, bind_outgoing_to); let maybe_max_table_write = match &config.experimental.rpc_in_flight_limiters {
RpcInFlightLimiterEnum::None => None,
RpcInFlightLimiterEnum::FixedSize(v) => Some(v.max_table_write),
};
let netapp = NetApp::new(
GARAGE_VERSION_TAG,
network_key,
node_key,
bind_outgoing_to,
maybe_max_table_write,
);
let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into()); let system_endpoint = netapp.endpoint(SYSTEM_RPC_PATH.into());
// ---- setup netapp public listener and full mesh peering strategy ---- // ---- setup netapp public listener and full mesh peering strategy ----

View file

@ -1,6 +1,6 @@
[package] [package]
name = "garage_table" name = "garage_table"
version = "1.3.1" version = "1.1.0"
authors = ["Alex Auvolat <alex@adnab.me>"] authors = ["Alex Auvolat <alex@adnab.me>"]
edition = "2018" edition = "2018"
license = "AGPL-3.0" license = "AGPL-3.0"

View file

@ -3,10 +3,12 @@ use std::convert::TryInto;
use std::sync::Arc; use std::sync::Arc;
use serde_bytes::ByteBuf; use serde_bytes::ByteBuf;
use tokio::sync::Notify; use tokio::sync::SemaphorePermit;
use tokio::sync::{Notify, Semaphore};
use garage_db as db; use garage_db as db;
use garage_util::config::MerkleBackpressureEnum;
use garage_util::data::*; use garage_util::data::*;
use garage_util::error::*; use garage_util::error::*;
use garage_util::migrate::Migrate; use garage_util::migrate::Migrate;
@ -20,6 +22,67 @@ use crate::replication::*;
use crate::schema::*; use crate::schema::*;
use crate::util::*; use crate::util::*;
/// Merkle updater todo list, bundled with the primitives used to coordinate the
/// code that appends to it with the code that processes it.
pub(crate) struct MerkleTodo {
/// DB tree opened under the name `<TABLE_NAME>:merkle_todo`.
merkle_todo: db::Tree,
/// Signalled with `notify_one()` every time `appended()` is called.
merkle_todo_notify: Notify,
/// Semaphore created with `max_queue_size` permits when the configuration is
/// `MerkleBackpressureEnum::FixedQueue`; `None` when it is
/// `MerkleBackpressureEnum::None`.
merkle_todo_bounded_queue: Option<Arc<Semaphore>>,
}
impl Clone for MerkleTodo {
    /// Duplicate this handle.
    ///
    /// The DB tree and the optional semaphore are cloned from `self`, whereas the
    /// copy is given a brand new `Notify` of its own (it is not taken from `self`).
    fn clone(&self) -> Self {
        let Self {
            merkle_todo: tree,
            merkle_todo_bounded_queue: bounded_queue,
            ..
        } = self;

        Self {
            merkle_todo: tree.clone(),
            merkle_todo_notify: Notify::new(),
            merkle_todo_bounded_queue: bounded_queue.clone(),
        }
    }
}
impl MerkleTodo {
fn new<F: TableSchema>(db: &db::Db, config: &MerkleBackpressureEnum) -> Self {
let merkle_todo = db
.open_tree(format!("{}:merkle_todo", F::TABLE_NAME))
.expect("Unable to open DB Merkle TODO tree");
let merkle_todo_bounded_queue = match config {
MerkleBackpressureEnum::None => None,
MerkleBackpressureEnum::FixedQueue(p) => {
Some(Arc::new(Semaphore::new(p.max_queue_size)))
}
};
Self {
merkle_todo,
merkle_todo_notify: Notify::new(),
merkle_todo_bounded_queue,
}
}
pub(crate) fn len(&self) -> Result<usize, db::Error> {
self.merkle_todo.len()
}
pub(crate) async fn with_db<F: FnOnce(&db::Tree, SemaphorePermit)>(&self, f: F) {
let bounded = self
.merkle_todo_bounded_queue
.clone()
.unwrap_or(Arc::new(Semaphore::new(1)));
let permit = bounded.acquire().await.unwrap();
f(&self.merkle_todo, permit);
}
pub(crate) fn appended(&self, permit: SemaphorePermit) {
permit.forget();
self.merkle_todo_notify.notify_one();
}
pub(crate) fn processed(&self) {
let bounded = self
.merkle_todo_bounded_queue
.clone()
.unwrap_or(Arc::new(Semaphore::new(1)));
bounded.add_permits(1);
}
}
pub struct TableData<F: TableSchema, R: TableReplication> { pub struct TableData<F: TableSchema, R: TableReplication> {
system: Arc<System>, system: Arc<System>,
@ -29,8 +92,7 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
pub store: db::Tree, pub store: db::Tree,
pub(crate) merkle_tree: db::Tree, pub(crate) merkle_tree: db::Tree,
pub(crate) merkle_todo: db::Tree, pub(crate) merkle_todo: MerkleTodo,
pub(crate) merkle_todo_notify: Notify,
pub(crate) insert_queue: db::Tree, pub(crate) insert_queue: db::Tree,
pub(crate) insert_queue_notify: Arc<Notify>, pub(crate) insert_queue_notify: Arc<Notify>,
@ -38,10 +100,18 @@ pub struct TableData<F: TableSchema, R: TableReplication> {
pub(crate) gc_todo: db::Tree, pub(crate) gc_todo: db::Tree,
pub(crate) metrics: TableMetrics, pub(crate) metrics: TableMetrics,
pub(crate) config: MerkleBackpressureEnum,
} }
impl<F: TableSchema, R: TableReplication> TableData<F, R> { impl<F: TableSchema, R: TableReplication> TableData<F, R> {
pub fn new(system: Arc<System>, instance: F, replication: R, db: &db::Db) -> Arc<Self> { pub fn new(
system: Arc<System>,
instance: F,
replication: R,
db: &db::Db,
config: &MerkleBackpressureEnum,
) -> Arc<Self> {
let store = db let store = db
.open_tree(format!("{}:table", F::TABLE_NAME)) .open_tree(format!("{}:table", F::TABLE_NAME))
.expect("Unable to open DB tree"); .expect("Unable to open DB tree");
@ -49,9 +119,8 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
let merkle_tree = db let merkle_tree = db
.open_tree(format!("{}:merkle_tree", F::TABLE_NAME)) .open_tree(format!("{}:merkle_tree", F::TABLE_NAME))
.expect("Unable to open DB Merkle tree tree"); .expect("Unable to open DB Merkle tree tree");
let merkle_todo = db
.open_tree(format!("{}:merkle_todo", F::TABLE_NAME)) let merkle_todo = MerkleTodo::new::<F>(db, config);
.expect("Unable to open DB Merkle TODO tree");
let insert_queue = db let insert_queue = db
.open_tree(format!("{}:insert_queue", F::TABLE_NAME)) .open_tree(format!("{}:insert_queue", F::TABLE_NAME))
@ -76,11 +145,11 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
store, store,
merkle_tree, merkle_tree,
merkle_todo, merkle_todo,
merkle_todo_notify: Notify::new(),
insert_queue, insert_queue,
insert_queue_notify: Arc::new(Notify::new()), insert_queue_notify: Arc::new(Notify::new()),
gc_todo, gc_todo,
metrics, metrics,
config: config.clone(),
}) })
} }
@ -167,6 +236,8 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
// - When an entry is modified or deleted, add it to the merkle updater's todo list. // - When an entry is modified or deleted, add it to the merkle updater's todo list.
// This has to be done atomically with the modification for the merkle updater // This has to be done atomically with the modification for the merkle updater
// to maintain consistency. The merkle updater must then be notified with todo_notify. // to maintain consistency. The merkle updater must then be notified with todo_notify.
// Also, to avoid overloading the merkle updater, you need to sleep a given amount of
// time to enable backpressure (i.e. slow down clients).
// - When an entry is updated to be a tombstone, add it to the gc_todo tree // - When an entry is updated to be a tombstone, add it to the gc_todo tree
pub(crate) fn update_many<T: Borrow<ByteBuf>>(&self, entries: &[T]) -> Result<(), Error> { pub(crate) fn update_many<T: Borrow<ByteBuf>>(&self, entries: &[T]) -> Result<(), Error> {
@ -201,6 +272,7 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
) -> Result<Option<F::E>, Error> { ) -> Result<Option<F::E>, Error> {
let tree_key = self.tree_key(partition_key, sort_key); let tree_key = self.tree_key(partition_key, sort_key);
// transaction begins
let changed = self.store.db().transaction(|tx| { let changed = self.store.db().transaction(|tx| {
let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, &tree_key)? { let (old_entry, old_bytes, new_entry) = match tx.get(&self.store, &tree_key)? {
Some(old_bytes) => { Some(old_bytes) => {
@ -238,31 +310,44 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
Ok(None) Ok(None)
} }
})?; })?;
// transaction ends
if let Some((new_entry, new_bytes_hash)) = changed { // early return if nothing changed
self.metrics.internal_update_counter.add(1); let (new_entry, new_bytes_hash) = match changed {
Some((e, b)) => (e, b),
let is_tombstone = new_entry.is_tombstone(); None => {
self.merkle_todo_notify.notify_one(); let maybe_bound = self.merkle_todo_bounded_queue.clone();
if is_tombstone { if let Some(b) = &maybe_bound {
// We are only responsible for GC'ing this item if we are the b.add_permits(1);
// "leader" of the partition, i.e. the first node in the
// set of nodes that replicates this partition.
// This avoids GC loops and does not change the termination properties
// of the GC algorithm, as in all cases GC is suspended if
// any node of the partition is unavailable.
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
// TODO: this probably breaks when the layout changes
let nodes = self.replication.storage_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) {
GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
} }
return Ok(None);
} }
};
Ok(Some(new_entry)) // Handle GC in case of tombstone
} else { let is_tombstone = new_entry.is_tombstone();
Ok(None) if is_tombstone {
// We are only responsible for GC'ing this item if we are the
// "leader" of the partition, i.e. the first node in the
// set of nodes that replicates this partition.
// This avoids GC loops and does not change the termination properties
// of the GC algorithm, as in all cases GC is suspended if
// any node of the partition is unavailable.
let pk_hash = Hash::try_from(&tree_key[..32]).unwrap();
// TODO: this probably breaks when the layout changes
let nodes = self.replication.storage_nodes(&pk_hash);
if nodes.first() == Some(&self.system.id) {
GcTodoEntry::new(tree_key, new_bytes_hash).save(&self.gc_todo)?;
}
} }
// Collect metrics
self.metrics.internal_update_counter.add(1);
// Synchronize with the Merkle Worker
self.merkle_todo_notify.notify_one(); // Wake-up it
Ok(Some(new_entry))
} }
pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> { pub(crate) fn delete_if_equal(self: &Arc<Self>, k: &[u8], v: &[u8]) -> Result<bool, Error> {
@ -282,10 +367,16 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
_ => Ok(false), _ => Ok(false),
})?; })?;
if removed { if !removed {
self.metrics.internal_delete_counter.add(1); let maybe_bound = self.merkle_todo_bounded_queue.clone();
self.merkle_todo_notify.notify_one(); if let Some(b) = &maybe_bound {
b.add_permits(1);
}
return Ok(false);
} }
self.metrics.internal_delete_counter.add(1);
self.merkle_todo_notify.notify_one();
Ok(removed) Ok(removed)
} }
@ -310,11 +401,18 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
_ => Ok(false), _ => Ok(false),
})?; })?;
if removed { if !removed {
self.metrics.internal_delete_counter.add(1); let maybe_bound = self.merkle_todo_bounded_queue.clone();
self.merkle_todo_notify.notify_one(); if let Some(b) = &maybe_bound {
b.add_permits(1);
}
return Ok(false);
} }
Ok(removed)
self.metrics.internal_delete_counter.add(1);
self.merkle_todo_notify.notify_one();
Ok(true)
} }
// ---- Insert queue functions ---- // ---- Insert queue functions ----
@ -367,7 +465,7 @@ impl<F: TableSchema, R: TableReplication> TableData<F, R> {
} }
} }
pub fn gc_todo_approximate_len(&self) -> Result<usize, Error> { pub fn gc_todo_len(&self) -> Result<usize, Error> {
Ok(self.gc_todo.approximate_len()?) Ok(self.gc_todo.len()?)
} }
} }

View file

@ -262,7 +262,8 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
// GC has been successful for all of these entries. // GC has been successful for all of these entries.
// We now remove them all from our local table and from the GC todo list. // We now remove them all from our local table and from the GC todo list.
for item in items { for item in items {
self.data let _is_removed = self
.data
.delete_if_equal_hash(&item.key[..], item.value_hash) .delete_if_equal_hash(&item.key[..], item.value_hash)
.err_context("GC: local delete tombstones")?; .err_context("GC: local delete tombstones")?;
item.remove_if_equal(&self.data.gc_todo) item.remove_if_equal(&self.data.gc_todo)
@ -275,14 +276,21 @@ impl<F: TableSchema, R: TableReplication> TableGc<F, R> {
impl<F: TableSchema, R: TableReplication> EndpointHandler<GcRpc> for TableGc<F, R> { impl<F: TableSchema, R: TableReplication> EndpointHandler<GcRpc> for TableGc<F, R> {
async fn handle(self: &Arc<Self>, message: &GcRpc, _from: NodeID) -> Result<GcRpc, Error> { async fn handle(self: &Arc<Self>, message: &GcRpc, _from: NodeID) -> Result<GcRpc, Error> {
let maybe_bounded = self.data.merkle_todo_bounded_queue.clone();
match message { match message {
GcRpc::Update(items) => { GcRpc::Update(items) => {
if let Some(b) = maybe_bounded {
b.acquire_many(items.len() as u32).await.unwrap().forget();
}
self.data.update_many(items)?; self.data.update_many(items)?;
Ok(GcRpc::Ok) Ok(GcRpc::Ok)
} }
GcRpc::DeleteIfEqualHash(items) => { GcRpc::DeleteIfEqualHash(items) => {
if let Some(b) = maybe_bounded {
b.acquire_many(items.len() as u32).await.unwrap().forget();
}
for (key, vhash) in items.iter() { for (key, vhash) in items.iter() {
self.data.delete_if_equal_hash(&key[..], *vhash)?; let _is_removed = self.data.delete_if_equal_hash(&key[..], *vhash)?;
} }
Ok(GcRpc::Ok) Ok(GcRpc::Ok)
} }
@ -313,7 +321,7 @@ impl<F: TableSchema, R: TableReplication> Worker for GcWorker<F, R> {
fn status(&self) -> WorkerStatus { fn status(&self) -> WorkerStatus {
WorkerStatus { WorkerStatus {
queue_length: Some(self.gc.data.gc_todo_approximate_len().unwrap_or(0) as u64), queue_length: Some(self.gc.data.gc_todo_len().unwrap_or(0) as u64),
..Default::default() ..Default::default()
} }
} }
@ -329,7 +337,6 @@ impl<F: TableSchema, R: TableReplication> Worker for GcWorker<F, R> {
} }
async fn wait_for_work(&mut self) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
tokio::time::sleep(self.wait_delay).await;
WorkerState::Busy WorkerState::Busy
} }
} }

View file

@ -9,6 +9,7 @@ use tokio::sync::watch;
use garage_db as db; use garage_db as db;
use garage_util::background::*; use garage_util::background::*;
use garage_util::config::MerkleBackpressureEnum;
use garage_util::data::*; use garage_util::data::*;
use garage_util::encode::{nonversioned_decode, nonversioned_encode}; use garage_util::encode::{nonversioned_decode, nonversioned_encode};
use garage_util::error::Error; use garage_util::error::Error;
@ -70,6 +71,15 @@ impl<F: TableSchema, R: TableReplication> MerkleUpdater<F, R> {
pub(crate) fn new(data: Arc<TableData<F, R>>) -> Arc<Self> { pub(crate) fn new(data: Arc<TableData<F, R>>) -> Arc<Self> {
let empty_node_hash = blake2sum(&nonversioned_encode(&MerkleNode::Empty).unwrap()[..]); let empty_node_hash = blake2sum(&nonversioned_encode(&MerkleNode::Empty).unwrap()[..]);
// @FIXME: move in worker
match &data.config {
MerkleBackpressureEnum::None => info!("Merkle Backpressure is not activated"),
MerkleBackpressureEnum::FixedQueue(v) => info!(
"Merkle backpressure with a fixed queue size (qlen={}) is activated.",
v.max_queue_size
),
}
Arc::new(Self { Arc::new(Self {
data, data,
empty_node_hash, empty_node_hash,
@ -125,6 +135,11 @@ impl<F: TableSchema, R: TableReplication> MerkleUpdater<F, R> {
k k
); );
} }
let maybe_bound = self.data.merkle_todo_bounded_queue.clone();
if let Some(b) = &maybe_bound {
b.add_permits(1);
}
Ok(()) Ok(())
} }
@ -287,12 +302,12 @@ impl<F: TableSchema, R: TableReplication> MerkleUpdater<F, R> {
MerkleNode::decode_opt(&ent) MerkleNode::decode_opt(&ent)
} }
pub fn merkle_tree_approximate_len(&self) -> Result<usize, Error> { pub fn merkle_tree_len(&self) -> Result<usize, Error> {
Ok(self.data.merkle_tree.approximate_len()?) Ok(self.data.merkle_tree.len()?)
} }
pub fn todo_approximate_len(&self) -> Result<usize, Error> { pub fn todo_len(&self) -> Result<usize, Error> {
Ok(self.data.merkle_todo.approximate_len()?) Ok(self.data.merkle_todo.len()?)
} }
} }
@ -306,7 +321,7 @@ impl<F: TableSchema, R: TableReplication> Worker for MerkleWorker<F, R> {
fn status(&self) -> WorkerStatus { fn status(&self) -> WorkerStatus {
WorkerStatus { WorkerStatus {
queue_length: Some(self.0.todo_approximate_len().unwrap_or(0) as u64), queue_length: Some(self.0.todo_len().unwrap_or(0) as u64),
..Default::default() ..Default::default()
} }
} }

View file

@ -1,12 +1,16 @@
use opentelemetry::{global, metrics::*, KeyValue}; use opentelemetry::{global, metrics::*, KeyValue};
use std::convert::TryInto;
use garage_db as db; use garage_db as db;
use crate::data::MerkleTodo;
/// TableMetrics reference all counter used for metrics /// TableMetrics reference all counter used for metrics
pub struct TableMetrics { pub struct TableMetrics {
pub(crate) _table_size: ValueObserver<u64>, pub(crate) _table_size: ValueObserver<u64>,
pub(crate) _merkle_tree_size: ValueObserver<u64>, pub(crate) _merkle_tree_size: ValueObserver<u64>,
pub(crate) _merkle_todo_len: ValueObserver<u64>, pub(crate) _merkle_todo_len: ValueObserver<u64>,
pub(crate) _merkle_todo_bounded_queue_free: ValueObserver<u64>,
pub(crate) _gc_todo_len: ValueObserver<u64>, pub(crate) _gc_todo_len: ValueObserver<u64>,
pub(crate) get_request_counter: BoundCounter<u64>, pub(crate) get_request_counter: BoundCounter<u64>,
@ -25,7 +29,7 @@ impl TableMetrics {
table_name: &'static str, table_name: &'static str,
store: db::Tree, store: db::Tree,
merkle_tree: db::Tree, merkle_tree: db::Tree,
merkle_todo: db::Tree, merkle_todo: MerkleTodo,
gc_todo: db::Tree, gc_todo: db::Tree,
) -> Self { ) -> Self {
let meter = global::meter(table_name); let meter = global::meter(table_name);
@ -34,7 +38,7 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.size", "table.size",
move |observer| { move |observer| {
if let Ok(value) = store.approximate_len() { if let Ok(value) = store.len() {
observer.observe( observer.observe(
value as u64, value as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],
@ -48,7 +52,7 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.merkle_tree_size", "table.merkle_tree_size",
move |observer| { move |observer| {
if let Ok(value) = merkle_tree.approximate_len() { if let Ok(value) = merkle_tree.len() {
observer.observe( observer.observe(
value as u64, value as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],
@ -62,7 +66,7 @@ impl TableMetrics {
.u64_value_observer( .u64_value_observer(
"table.merkle_updater_todo_queue_length", "table.merkle_updater_todo_queue_length",
move |observer| { move |observer| {
if let Ok(v) = merkle_todo.approximate_len() { if let Ok(v) = merkle_todo.len() {
observer.observe( observer.observe(
v as u64, v as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],
@ -72,11 +76,25 @@ impl TableMetrics {
) )
.with_description("Merkle tree updater TODO queue length") .with_description("Merkle tree updater TODO queue length")
.init(), .init(),
_merkle_todo_bounded_queue_free: meter
.u64_value_observer(
"table.merkle_todo_bounded_queue_free",
move |observer| {
let maybe_bounded = merkle_todo_bounded_queue.clone();
let free: u64 = match &maybe_bounded {
Some(v) => v.available_permits().try_into().unwrap(),
None => 0,
};
observer.observe(free, &[KeyValue::new("table_name", table_name)])
}
)
.with_description("Merkle TODO queue free slots")
.init(),
_gc_todo_len: meter _gc_todo_len: meter
.u64_value_observer( .u64_value_observer(
"table.gc_todo_queue_length", "table.gc_todo_queue_length",
move |observer| { move |observer| {
if let Ok(value) = gc_todo.approximate_len() { if let Ok(value) = gc_todo.len() {
observer.observe( observer.observe(
value as u64, value as u64,
&[KeyValue::new("table_name", table_name)], &[KeyValue::new("table_name", table_name)],

View file

@ -27,7 +27,7 @@ impl<F: TableSchema, R: TableReplication> Worker for InsertQueueWorker<F, R> {
fn status(&self) -> WorkerStatus { fn status(&self) -> WorkerStatus {
WorkerStatus { WorkerStatus {
queue_length: Some(self.0.data.insert_queue.approximate_len().unwrap_or(0) as u64), queue_length: Some(self.0.data.insert_queue.len().unwrap_or(0) as u64),
..Default::default() ..Default::default()
} }
} }

Some files were not shown because too many files have changed in this diff Show more