From 7e315a8c70609927ed6a284d3915cda71a731ffb Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 11 Nov 2025 14:58:51 +0000 Subject: [PATCH 1/6] Merged PR 12878148: bridge: Force sequential message handling for confidential containers [cherry-picked from f81b450894206a79fff4d63182ff034ba503ebdb] This PR contains 2 commits. The first one is the fix: **bridge: Force sequential message handling for confidential containers** This fixes a vulnerability (and reduces the surface for other similar potential vulnerabilities) in confidential containers where if the host sends a mount/unmount modification request concurrently with an ongoing CreateContainer request, the host could re-order or skip image layers for the container to be started. While this could be fixed by adding mutex lock/unlock around the individual modifyMappedVirtualDisk/modifyCombinedLayers/CreateContainer functions, we decided that in order to prevent any more of this class of issues, the UVM, when running in confidential mode, should just not allow concurrent requests (with exception for any actually long-running requests, which for now is just waitProcess). The second one adds a log entry for when the processing thread blocks. This will make it easier to debug should the gcs "hung" on a request. This PR is created on ADO targeting the conf branch as this security vulnerability is not public yet. This fix should be backported to main once deployed. Fix usage of IsSNP() to handle the new error return value Related work items: #33357501, #34327300 Signed-off-by: Tingmao Wang --- cmd/gcs/main.go | 12 +++++ internal/guest/bridge/bridge.go | 96 +++++++++++++++++++++++++-------- 2 files changed, 85 insertions(+), 23 deletions(-) diff --git a/cmd/gcs/main.go b/cmd/gcs/main.go index 0162e7936d..fd9575ec3b 100644 --- a/cmd/gcs/main.go +++ b/cmd/gcs/main.go @@ -4,6 +4,7 @@ package main import ( + "context" "flag" "fmt" "io" @@ -30,6 +31,7 @@ import ( "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/version" + "github.com/Microsoft/hcsshim/pkg/amdsevsnp" "github.com/Microsoft/hcsshim/pkg/securitypolicy" oci "github.com/opencontainers/runtime-spec/specs-go" ) @@ -414,6 +416,16 @@ func main() { mux := bridge.NewBridgeMux() b := bridge.New(mux, *v4) + // For confidential containers, we protect ourselves against attacks caused + // by concurrent modifications, by processing one request at a time. + sequential, err := amdsevsnp.IsSNP() + if err != nil { + // IsSNP cannot fail on LCOW + log.G(context.TODO()).WithError(err).Error("unexpected error from IsSNP") + // If it fails, we proceed with forceSequential enabled to be safe + sequential = true + } + b.Sequential = sequential b.AssignHandlers(mux, h) // Reconnect loop: dial the host, serve until the connection drops, then diff --git a/internal/guest/bridge/bridge.go b/internal/guest/bridge/bridge.go index d8d57e786d..e452d4f5ee 100644 --- a/internal/guest/bridge/bridge.go +++ b/internal/guest/bridge/bridge.go @@ -177,6 +177,10 @@ type Bridge struct { Handler Handler // EnableV4 enables the v4+ bridge and the schema v2+ interfaces. EnableV4 bool + // Setting Sequential to true will force the bridge to only process one + // request at a time, except for certain long-running operations (as defined + // in asyncMessages). + Sequential bool // responseChan is the response channel used for both request/response // and publish notification workflows. @@ -256,6 +260,14 @@ func (b *Bridge) ShutdownRequested() bool { return b.hasQuitPending.Load() } +// Identify messages that will be processed asynchronously even in sequential +// mode. Note that in sequential mode, these messages will still wait for any +// in-progress non-async messages to be handled before they are processed, but +// once they are "acknowledged", the rest will be done asynchronously. +func alwaysAsync(msgType prot.MessageIdentifier) bool { + return msgType == prot.ComputeSystemWaitForProcessV1 +} + // AssignHandlers creates and assigns the appropriate bridge // events to be listen for and intercepted on `mux` before forwarding // to `gcs` for handling. @@ -307,6 +319,10 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser defer b.disconnectNotifications() defer close(b.quitChan) + if b.Sequential { + log.G(context.Background()).Info("bridge: ForceSequential enabled") + } + // Receive bridge requests and schedule them to be processed. go func() { var recverr error @@ -409,30 +425,38 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser }() // Process each bridge request async and create the response writer. go func() { - for req := range requestChan { - go func(r *Request) { - br := bridgeResponse{ - ctx: r.Context, - header: &prot.MessageHeader{ - Type: prot.GetResponseIdentifier(r.Header.Type), - ID: r.Header.ID, - }, - } - resp, err := b.Handler.ServeMsg(r) - if resp == nil { - resp = &prot.MessageResponseBase{} - } - resp.Base().ActivityID = r.ActivityID - if err != nil { - span := trace.FromContext(r.Context) - if span != nil { - oc.SetSpanStatus(span, err) - } - setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */) + doRequest := func(r *Request) { + br := bridgeResponse{ + ctx: r.Context, + header: &prot.MessageHeader{ + Type: prot.GetResponseIdentifier(r.Header.Type), + ID: r.Header.ID, + }, + } + resp, err := b.Handler.ServeMsg(r) + if resp == nil { + resp = &prot.MessageResponseBase{} + } + resp.Base().ActivityID = r.ActivityID + if err != nil { + span := trace.FromContext(r.Context) + if span != nil { + oc.SetSpanStatus(span, err) } - br.response = resp - b.responseChan <- br - }(req) + setErrorForResponseBase(resp.Base(), err, "gcs" /* moduleName */) + } + br.response = resp + b.responseChan <- br + } + + for req := range requestChan { + if b.Sequential && !alwaysAsync(req.Header.Type) { + // This will log warn after 5 seconds if the request is still + // being processed, + runSequentialRequest(req, doRequest) + } else { + go doRequest(req) + } } }() // Process each bridge response sync. This channel is for request/response and publish workflows. @@ -495,6 +519,32 @@ func (b *Bridge) ListenAndServe(bridgeIn io.ReadCloser, bridgeOut io.WriteCloser } } +// Do handleFn(r), but prints a warning if handleFn does not, or takes too long +// to return. +func runSequentialRequest(r *Request, handleFn func(*Request)) { + // Note that this is only a context used for triggering the blockage + // warning, the request processing still uses r.Context. We don't want to + // cancel the request handling itself when we reach the 5s timeout. + timeoutCtx, cancel := context.WithTimeout(r.Context, 5*time.Second) + go func() { + <-timeoutCtx.Done() + if errors.Is(timeoutCtx.Err(), context.DeadlineExceeded) { + log.G(timeoutCtx).WithFields(logrus.Fields{ + // We want to log those even though we're providing r.Context, since if + // the request never finishes the span end log will never get written, + // and we may therefore not be able to find out about the following info + // otherwise: + "message-type": r.Header.Type.String(), + "message-id": r.Header.ID, + "activity-id": r.ActivityID, + "container-id": r.ContainerID, + }).Warnf("bridge: request processing thread in sequential mode blocked on the current request for more than 5 seconds") + } + }() + defer cancel() + handleFn(r) +} + // PublishNotification writes a specific notification to the bridge. func (b *Bridge) PublishNotification(n *prot.ContainerNotification) { ctx, span := oc.StartSpan(context.Background(), From 5a361fe4bb3534504c626de5e520e4efbcd30671 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Mon, 20 Oct 2025 16:52:21 +0000 Subject: [PATCH 2/6] Merged PR 13618357: guest/network: Restrict hostname to valid characters [cherry-picked from 055ee5eb4a802cb407575fb6cc1e9b07069d3319] guest/network: Restrict hostname to valid characters Because we write this hostname to /etc/hosts, without proper validation the host can trick us into writing arbitrary data to /etc/hosts, which can, for example, redirect things like ip6-localhost (but likely not localhost itself) to an attacker-controlled IP address. We implement a check here that the host-provided DNS name in the OCI spec is valid. ACI actually restricts this to 5-63 characters of a-zA-Z0-9 and '-', where the first and last characters can not be '-'. This aligns with the Kubernetes restriction. c.f. IsValidDnsLabel in Compute-ACI. However, there is no consistent official agreement on what a valid hostname can contain. RFC 952 says that "Domain name" can be up to 24 characters of A-Z0-9 '.' and '-', RFC 1123 expands this to 255 characters, but RFC 1035 claims that domain names can in fact contain anything if quoted (as long as the length is within 255 characters), and this is confirmed again in RFC 2181. In practice we see names with underscopes, most commonly \_dmarc.example.com. curl allows 0-9a-zA-Z and -.\_|~ and any other codepoints from \u0001-\u001f and above \u007f: https://github.com/curl/curl/blob/master/lib/urlapi.c#L527-L545 With the above in mind, this commit allows up to 255 characters of a-zA-Z0-9 and '_', '-' and '.'. This change is applied to all LCOW use cases. This fix can be tested with the below code to bypass any host-side checks: +++ b/internal/hcsoci/hcsdoc_lcow.go @@ -52,6 +52,10 @@ func createLCOWSpec(ctx context.Context, coi *createOptionsInternal) (*specs.Spe spec.Linux.Seccomp = nil } + if spec.Annotations[annotations.KubernetesContainerType] == "sandbox" { + spec.Hostname = "invalid-hostname\n1.1.1.1 ip6-localhost ip6-loopback localhost" + } + return spec, nil } Output: time="2025-10-01T15:13:41Z" level=fatal msg="run pod sandbox: rpc error: code = Unknown desc = failed to create containerd task: failed to create shim task: failed to create container f2209bb2960d5162fc9937d3362e1e2cf1724c56d1296ba2551ce510cb2bcd43: guest RPC failure: hostname \"invalid-hostname\\n1.1.1.1 ip6-localhost ip6-loopback localhost\" invalid: must match ^[a-zA-Z0-9_\\-\\.]{0,999}$: unknown" Related work items: #34370598 Closes: https://msazure.visualstudio.com/One/_workitems/edit/34370598 Signed-off-by: Tingmao Wang --- internal/guest/network/network.go | 13 +++++++ internal/guest/network/network_test.go | 35 +++++++++++++++++++ .../guest/runtime/hcsv2/sandbox_container.go | 3 ++ .../runtime/hcsv2/standalone_container.go | 3 ++ 4 files changed, 54 insertions(+) diff --git a/internal/guest/network/network.go b/internal/guest/network/network.go index cc06398e41..f68f8afa39 100644 --- a/internal/guest/network/network.go +++ b/internal/guest/network/network.go @@ -9,6 +9,7 @@ import ( "fmt" "os" "path/filepath" + "regexp" "strings" "time" @@ -32,6 +33,18 @@ var ( // maxDNSSearches is limited to 6 in `man 5 resolv.conf` const maxDNSSearches = 6 +var validHostnameRegex = regexp.MustCompile(`^[a-zA-Z0-9_\-\.]{0,255}$`) + +// Check that the hostname is safe. This function is less strict than +// technically allowed, but ensures that when the hostname is inserted to +// /etc/hosts, it cannot lead to injection attacks. +func ValidateHostname(hostname string) error { + if !validHostnameRegex.MatchString(hostname) { + return errors.Errorf("hostname %q invalid: must match %s", hostname, validHostnameRegex.String()) + } + return nil +} + // GenerateEtcHostsContent generates a /etc/hosts file based on `hostname`. func GenerateEtcHostsContent(ctx context.Context, hostname string) string { _, span := oc.StartSpan(ctx, "network::GenerateEtcHostsContent") diff --git a/internal/guest/network/network_test.go b/internal/guest/network/network_test.go index 9a718aed66..cc0ec1be3a 100644 --- a/internal/guest/network/network_test.go +++ b/internal/guest/network/network_test.go @@ -7,6 +7,7 @@ import ( "context" "os" "path/filepath" + "strings" "testing" "time" ) @@ -70,6 +71,40 @@ func Test_GenerateResolvConfContent(t *testing.T) { } } +func Test_ValidateHostname(t *testing.T) { + validNames := []string{ + "localhost", + "my-hostname", + "my.hostname", + "my-host-name123", + "_underscores.are.allowed.too", + "", // Allow not specifying a hostname + } + + invalidNames := []string{ + "localhost\n13.104.0.1 ip6-localhost ip6-loopback localhost", + "localhost\n2603:1000::1 ip6-localhost ip6-loopback", + "hello@microsoft.com", + "has space", + "has,comma", + "\x00", + "a\nb", + strings.Repeat("a", 1000), + } + + for _, n := range validNames { + if err := ValidateHostname(n); err != nil { + t.Fatalf("expected %q to be valid, got: %v", n, err) + } + } + + for _, n := range invalidNames { + if err := ValidateHostname(n); err == nil { + t.Fatalf("expected %q to be invalid, but got nil error", n) + } + } +} + func Test_GenerateEtcHostsContent(t *testing.T) { type testcase struct { name string diff --git a/internal/guest/runtime/hcsv2/sandbox_container.go b/internal/guest/runtime/hcsv2/sandbox_container.go index 5f3d80356b..c6623d52f1 100644 --- a/internal/guest/runtime/hcsv2/sandbox_container.go +++ b/internal/guest/runtime/hcsv2/sandbox_container.go @@ -50,6 +50,9 @@ func setupSandboxContainerSpec(ctx context.Context, id, sandboxRoot string, spec // Write the hostname hostname := spec.Hostname + if err = network.ValidateHostname(hostname); err != nil { + return err + } if hostname == "" { var err error hostname, err = os.Hostname() diff --git a/internal/guest/runtime/hcsv2/standalone_container.go b/internal/guest/runtime/hcsv2/standalone_container.go index 4cf8cc72ec..27b6f8471b 100644 --- a/internal/guest/runtime/hcsv2/standalone_container.go +++ b/internal/guest/runtime/hcsv2/standalone_container.go @@ -46,6 +46,9 @@ func setupStandaloneContainerSpec(ctx context.Context, id, rootDir string, spec }() hostname := spec.Hostname + if err = network.ValidateHostname(hostname); err != nil { + return err + } if hostname == "" { var err error hostname, err = os.Hostname() From b214dc8e479f0ed6b34326227cc017f5de31ed00 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 11 Nov 2025 16:48:20 +0000 Subject: [PATCH 3/6] Merged PR 12878779: Enhance uvm_state::hostMounts to track in-use mounts, and prevent unmounting or deleting in-use things [cherry-picked from d0334883cd43eecbb401a6ded3e0317179a3e54b] This set of changes adds some checks (when running with a confidential policy) to prevent the host from trying to clean up mounts, overlays, or the container states dir when the container is running (or when the overlay has not been unmounted yet). This is through enhancing the existing `hostMounts` utility, as well as adding a `terminated` flag to the Container struct. The correct order of operations should always be: - mount read-only layers and scratch (in any order, and individual containers (not the sandbox) might not have their own scratch) - mount the overlay - start the container - container terminates - unmount overlay - unmount read-only layers and scratch The starting up order is implied, and we now explicitly deny e.g. unmounting layer/scratch before unmounting overlay, or unmounting the overlay while container has not terminated. We also deny deleteContainerState requests when the container is running or when the overlay is mounted. Doing so when a container is running can result in unexpectedly deleting its files, which breaks it in unpredictable ways and is bad. Signed-off-by: Tingmao Wang --- internal/guest/bridge/bridge_v2.go | 8 +- internal/guest/runtime/hcsv2/container.go | 3 + internal/guest/runtime/hcsv2/process.go | 3 + internal/guest/runtime/hcsv2/uvm.go | 246 +++++++++--- internal/guest/runtime/hcsv2/uvm_state.go | 357 +++++++++++++++--- .../guest/runtime/hcsv2/uvm_state_test.go | 219 ++++++++++- 6 files changed, 738 insertions(+), 98 deletions(-) diff --git a/internal/guest/bridge/bridge_v2.go b/internal/guest/bridge/bridge_v2.go index dc13ec6428..748e9dcd9b 100644 --- a/internal/guest/bridge/bridge_v2.go +++ b/internal/guest/bridge/bridge_v2.go @@ -467,16 +467,10 @@ func (b *Bridge) deleteContainerStateV2(r *Request) (_ RequestResponse, err erro return nil, errors.Wrapf(err, "failed to unmarshal JSON in message \"%s\"", r.Message) } - c, err := b.hostState.GetCreatedContainer(request.ContainerID) + err = b.hostState.DeleteContainerState(ctx, request.ContainerID) if err != nil { return nil, err } - // remove container state regardless of delete's success - defer b.hostState.RemoveContainer(request.ContainerID) - - if err := c.Delete(ctx); err != nil { - return nil, err - } return &prot.MessageResponseBase{}, nil } diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 13c0231801..15d416ae96 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -77,6 +77,9 @@ type Container struct { // and deal with the extra pointer dereferencing overhead. status atomic.Uint32 + // Set to true when the init process for the container has exited + terminated atomic.Bool + // scratchDirPath represents the path inside the UVM where the scratch directory // of this container is located. Usually, this is either `/run/gcs/c/` or // `/run/gcs/c//container_` if scratch is shared with UVM scratch. diff --git a/internal/guest/runtime/hcsv2/process.go b/internal/guest/runtime/hcsv2/process.go index e94c9792f6..b4a9b60f72 100644 --- a/internal/guest/runtime/hcsv2/process.go +++ b/internal/guest/runtime/hcsv2/process.go @@ -99,6 +99,9 @@ func newProcess(c *Container, spec *oci.Process, process runtime.Process, pid ui log.G(ctx).WithError(err).Error("failed to wait for runc process") } p.exitCode = exitCode + if p.init { + c.terminated.Store(true) + } log.G(ctx).WithField("exitCode", p.exitCode).Debug("process exited") // Free any process waiters diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 513fe00acf..63545fa7ef 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -17,6 +17,7 @@ import ( "regexp" "strings" "sync" + "sync/atomic" "syscall" "time" @@ -25,6 +26,7 @@ import ( "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" + "go.opencensus.io/trace" "golang.org/x/sys/unix" "github.com/Microsoft/hcsshim/internal/bridgeutils/gcserr" @@ -44,6 +46,7 @@ import ( "github.com/Microsoft/hcsshim/internal/guestpath" "github.com/Microsoft/hcsshim/internal/log" "github.com/Microsoft/hcsshim/internal/logfields" + "github.com/Microsoft/hcsshim/internal/oc" "github.com/Microsoft/hcsshim/internal/oci" "github.com/Microsoft/hcsshim/internal/protocol/guestrequest" "github.com/Microsoft/hcsshim/internal/protocol/guestresource" @@ -114,7 +117,9 @@ type Host struct { securityOptions *securitypolicy.SecurityOptions // hostMounts keeps the state of currently mounted devices and file systems, - // which is used for GCS hardening. + // which is used for GCS hardening. It is only used for confidential + // containers, and is initialized in SetConfidentialUVMOptions. If this is + // nil, we do not do add any special restrictions on mounts / unmounts. hostMounts *hostMounts // uvmError contains a permanent flag to indicate that further mounts, // unmounts and container creation / deletion should not be allowed. This @@ -172,7 +177,7 @@ func NewHost(rtime runtime.Runtime, vsock transport.Transport, initialEnforcer s rtime: rtime, vsock: vsock, devNullTransport: &transport.DevNullTransport{}, - hostMounts: newHostMounts(), + hostMounts: nil, securityOptions: securityPolicyOptions, uvmError: uvmConsistencyError{}, } @@ -501,6 +506,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM isSandbox: criType == "sandbox", exitType: prot.NtUnexpectedExit, processes: make(map[uint32]*containerProcess), + terminated: atomic.Bool{}, scratchDirPath: settings.ScratchDirPath, } c.setStatus(containerCreating) @@ -809,6 +815,25 @@ func writeSpecToFile(ctx context.Context, configFile string, spec *specs.Spec) e return nil } +// Returns whether there is a running container that is currently using the +// given overlay (as its rootfs). +func (h *Host) IsOverlayInUse(overlayPath string) bool { + h.containersMutex.Lock() + defer h.containersMutex.Unlock() + + for _, c := range h.containers { + if c.terminated.Load() { + continue + } + + if c.spec.Root.Path == overlayPath { + return true + } + } + + return false +} + func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req *guestrequest.ModificationRequest) (retErr error) { if h.HasSecurityPolicy() { if err := checkValidContainerID(containerID, "container"); err != nil { @@ -832,35 +857,6 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * return err } mvd.Controller = cNum - // first we try to update the internal state for read-write attachments. - if !mvd.ReadOnly { - localCtx, cancel := context.WithTimeout(ctx, time.Second*5) - defer cancel() - source, err := scsi.GetDevicePath(localCtx, mvd.Controller, mvd.Lun, mvd.Partition) - if err != nil { - return err - } - switch req.RequestType { - case guestrequest.RequestTypeAdd: - if err := h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted); err != nil { - return err - } - defer func() { - if retErr != nil { - _ = h.hostMounts.RemoveRWDevice(mvd.MountPath, source) - } - }() - case guestrequest.RequestTypeRemove: - if err := h.hostMounts.RemoveRWDevice(mvd.MountPath, source); err != nil { - return err - } - defer func() { - if retErr != nil { - _ = h.hostMounts.AddRWDevice(mvd.MountPath, source, mvd.Encrypted) - } - }() - } - } return h.modifyMappedVirtualDisk(ctx, req.RequestType, mvd) case guestresource.ResourceTypeMappedDirectory: if err := h.checkState(); err != nil { @@ -879,12 +875,7 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * return err } - cl := req.Settings.(*guestresource.LCOWCombinedLayers) - // when cl.ScratchPath == "", we mount overlay as read-only, in which case - // we don't really care about scratch encryption, since the host already - // knows about the layers and the overlayfs. - encryptedScratch := cl.ScratchPath != "" && h.hostMounts.IsEncrypted(cl.ScratchPath) - return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers), encryptedScratch) + return h.modifyCombinedLayers(ctx, req.RequestType, req.Settings.(*guestresource.LCOWCombinedLayers)) case guestresource.ResourceTypeNetwork: return modifyNetwork(ctx, req.RequestType, req.Settings.(*guestresource.LCOWNetworkAdapter)) case guestresource.ResourceTypeVPCIDevice: @@ -900,11 +891,23 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * if !ok { return errors.New("the request's settings are not of type ConfidentialOptions") } - return h.securityOptions.SetConfidentialOptions(ctx, + err := h.securityOptions.SetConfidentialOptions(ctx, r.EnforcerType, r.EncodedSecurityPolicy, r.EncodedUVMReference, r.EncodedUVMHashEnvelopeReference) + if err != nil { + return err + } + + // Start tracking mounts and restricting unmounts on confidential containers. + // As long as we started off with the ClosedDoorSecurityPolicyEnforcer, no + // mounts should have been allowed until this point. + if h.HasSecurityPolicy() { + log.G(ctx).Debug("hostMounts initialized") + h.hostMounts = newHostMounts() + } + return nil case guestresource.ResourceTypePolicyFragment: r, ok := req.Settings.(*guestresource.SecurityPolicyFragment) if !ok { @@ -1278,18 +1281,33 @@ func (h *Host) modifyMappedVirtualDisk( rt guestrequest.RequestType, mvd *guestresource.LCOWMappedVirtualDisk, ) (err error) { + ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyMappedVirtualDisk") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("requestType", string(rt)), + trace.BoolAttribute("hasHostMounts", h.hostMounts != nil), + trace.Int64Attribute("controller", int64(mvd.Controller)), + trace.Int64Attribute("lun", int64(mvd.Lun)), + trace.Int64Attribute("partition", int64(mvd.Partition)), + trace.BoolAttribute("readOnly", mvd.ReadOnly), + trace.StringAttribute("mountPath", mvd.MountPath), + ) + var verityInfo *guestresource.DeviceVerityInfo securityPolicy := h.securityOptions.PolicyEnforcer + devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition) + if err != nil { + return err + } + span.AddAttributes(trace.StringAttribute("devicePath", devPath)) + if mvd.ReadOnly { // The only time the policy is empty, and we want it to be empty // is when no policy is provided, and we default to open door // policy. In any other case, e.g. explicit open door or any // other rego policy we would like to mount layers with verity. if h.HasSecurityPolicy() { - devPath, err := scsi.GetDevicePath(ctx, mvd.Controller, mvd.Lun, mvd.Partition) - if err != nil { - return err - } verityInfo, err = verity.ReadVeritySuperBlock(ctx, devPath) if err != nil { return err @@ -1319,11 +1337,42 @@ func (h *Host) modifyMappedVirtualDisk( if err != nil { return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + err = h.hostMounts.AddRODevice(mvd.MountPath, devPath) + if err != nil { + return err + } + // Note: "When a function returns, its deferred calls are + // executed in last-in-first-out order." - so we are safe to + // call RemoveRODevice in this defer. + defer func() { + if err != nil { + _ = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath) + } + }() + } } else { err = securityPolicy.EnforceRWDeviceMountPolicy(ctx, mvd.MountPath, mvd.Encrypted, mvd.EnsureFilesystem, mvd.Filesystem) if err != nil { return errors.Wrapf(err, "mounting scsi device controller %d lun %d onto %s denied by policy", mvd.Controller, mvd.Lun, mvd.MountPath) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + err = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + if err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + } + }() + } } config := &scsi.Config{ Encrypted: mvd.Encrypted, @@ -1348,10 +1397,36 @@ func (h *Host) modifyMappedVirtualDisk( if err = securityPolicy.EnforceDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + if err = h.hostMounts.RemoveRODevice(mvd.MountPath, devPath); err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.AddRODevice(mvd.MountPath, devPath) + } + }() + } } else { if err = securityPolicy.EnforceRWDeviceUnmountPolicy(ctx, mvd.MountPath); err != nil { return fmt.Errorf("unmounting scsi device at %s denied by policy: %w", mvd.MountPath, err) } + if h.hostMounts != nil { + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + + if err = h.hostMounts.RemoveRWDevice(mvd.MountPath, devPath, mvd.Encrypted); err != nil { + return err + } + defer func() { + if err != nil { + _ = h.hostMounts.AddRWDevice(mvd.MountPath, devPath, mvd.Encrypted) + } + }() + } } // Check that the directory actually exists first, and if it // does not then we just refuse to do anything, without closing @@ -1521,8 +1596,17 @@ func (h *Host) modifyCombinedLayers( ctx context.Context, rt guestrequest.RequestType, cl *guestresource.LCOWCombinedLayers, - scratchEncrypted bool, ) (err error) { + ctx, span := oc.StartSpan(ctx, "gcs::Host::modifyCombinedLayers") + defer span.End() + defer func() { oc.SetSpanStatus(span, err) }() + span.AddAttributes( + trace.StringAttribute("requestType", string(rt)), + trace.BoolAttribute("hasHostMounts", h.hostMounts != nil), + trace.StringAttribute("containerRootPath", cl.ContainerRootPath), + trace.StringAttribute("scratchPath", cl.ScratchPath), + ) + securityPolicy := h.securityOptions.PolicyEnforcer containerID := cl.ContainerID @@ -1531,6 +1615,12 @@ func (h *Host) modifyCombinedLayers( // we've actually called Unmount and it fails we permanently block further // device operations. return securityPolicy.WithMetadataRollback(func() error { + if h.hostMounts != nil { + // We will need this in multiple places, let's take the lock once here. + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + } + switch rt { case guestrequest.RequestTypeAdd: if h.HasSecurityPolicy() { @@ -1578,15 +1668,29 @@ func (h *Host) modifyCombinedLayers( } else { upperdirPath = filepath.Join(cl.ScratchPath, "upper") workdirPath = filepath.Join(cl.ScratchPath, "work") + scratchEncrypted := false + if h.hostMounts != nil { + scratchEncrypted = h.hostMounts.IsEncrypted(cl.ScratchPath) + } if err := securityPolicy.EnforceScratchMountPolicy(ctx, cl.ScratchPath, scratchEncrypted); err != nil { return fmt.Errorf("scratch mounting denied by policy: %w", err) } } - if err := securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { + if err = securityPolicy.EnforceOverlayMountPolicy(ctx, containerID, layerPaths, cl.ContainerRootPath); err != nil { return fmt.Errorf("overlay creation denied by policy: %w", err) } + if h.hostMounts != nil { + if err = h.hostMounts.AddOverlay(cl.ContainerRootPath, layerPaths, cl.ScratchPath); err != nil { + return err + } + defer func() { + if err != nil { + _, _ = h.hostMounts.RemoveOverlay(cl.ContainerRootPath) + } + }() + } // Correctness for policy transaction rollback: // MountLayer does two things - mkdir, then mount. On mount failure, the @@ -1600,6 +1704,23 @@ func (h *Host) modifyCombinedLayers( return errors.Wrap(err, "overlay removal denied by policy") } + // Check that no running container is using this overlay as its rootfs. + if h.HasSecurityPolicy() && h.IsOverlayInUse(cl.ContainerRootPath) { + return fmt.Errorf("overlay %q is in use by a running container", cl.ContainerRootPath) + } + + if h.hostMounts != nil { + var undoRemoveOverlay func() + if undoRemoveOverlay, err = h.hostMounts.RemoveOverlay(cl.ContainerRootPath); err != nil { + return err + } + defer func() { + if err != nil && undoRemoveOverlay != nil { + undoRemoveOverlay() + } + }() + } + // Note: storage.UnmountPath is a no-op if the path does not exist. err = storage.UnmountPath(ctx, cl.ContainerRootPath, true) if err != nil { @@ -1737,3 +1858,40 @@ func (h *Host) createContainerInPod(sandboxID string, containerID string) error return nil } + +func (h *Host) DeleteContainerState(ctx context.Context, containerID string) error { + if h.HasSecurityPolicy() { + if err := checkValidContainerID(containerID, "container"); err != nil { + return err + } + } + + if err := h.checkState(); err != nil { + return err + } + + c, err := h.GetCreatedContainer(containerID) + if err != nil { + return err + } + if h.HasSecurityPolicy() { + if !c.terminated.Load() { + return errors.Errorf("Denied deleting state of a running container %q", containerID) + } + overlay := c.spec.Root.Path + h.hostMounts.Lock() + defer h.hostMounts.Unlock() + if h.hostMounts.HasOverlayMountedAt(overlay) { + return errors.Errorf("Denied deleting state of a container with a overlay mount still active") + } + } + + // remove container state regardless of delete's success + defer h.RemoveContainer(containerID) + + if err = c.Delete(ctx); err != nil { + return err + } + + return nil +} diff --git a/internal/guest/runtime/hcsv2/uvm_state.go b/internal/guest/runtime/hcsv2/uvm_state.go index dd1ff521f0..f7d347cbf4 100644 --- a/internal/guest/runtime/hcsv2/uvm_state.go +++ b/internal/guest/runtime/hcsv2/uvm_state.go @@ -4,91 +4,358 @@ package hcsv2 import ( + "context" "fmt" "path/filepath" "strings" "sync" + + "github.com/Microsoft/hcsshim/internal/log" + "github.com/sirupsen/logrus" +) + +type deviceType int + +const ( + DeviceTypeRW deviceType = iota + DeviceTypeRO + DeviceTypeOverlay ) -type rwDevice struct { +func (d deviceType) String() string { + switch d { + case DeviceTypeRW: + return "RW" + case DeviceTypeRO: + return "RO" + case DeviceTypeOverlay: + return "Overlay" + default: + return fmt.Sprintf("Unknown(%d)", d) + } +} + +type device struct { + // fields common to all mountPath string + ty deviceType + usage int sourcePath string - encrypted bool + + // rw devices + encrypted bool + + // overlay devices + referencedDevices []*device } +// hostMounts tracks the state of fs/overlay mounts and their usage +// relationship. Users of this struct must call hm.Lock() before calling any +// other methods and call hm.Unlock() when done. +// +// Since mount/unmount operations can fail, the expected way to use this struct +// is to first lock it, call the method to add/remove the device, then, with the +// lock still held, perform the actual operation. If the operation fails, the +// caller must undo the operation by calling the appropriate remove/add method +// or the returned undo function, before unlocking. type hostMounts struct { - stateMutex sync.Mutex + stateMutex sync.Mutex + stateMutexLocked bool - // Holds information about read-write devices, which can be encrypted and - // contain overlay fs upper/work directory mounts. - readWriteMounts map[string]*rwDevice + // Map from mountPath to device struct + devices map[string]*device } func newHostMounts() *hostMounts { return &hostMounts{ - readWriteMounts: map[string]*rwDevice{}, + devices: make(map[string]*device), } } -// AddRWDevice adds read-write device metadata for device mounted at `mountPath`. -// Returns an error if there's an existing device mounted at `mountPath` location. -func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error { +func (hm *hostMounts) expectLocked() { + if !hm.stateMutexLocked { + panic("hostMounts: expected stateMutex to be locked, but it was not") + } +} + +// Locks the state mutex. This is not re-entrant, calling it twice in the same +// thread will deadlock/panic. +func (hm *hostMounts) Lock() { hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() + // Since we just acquired the lock, either it was not locked before, or + // somebody just unlocked it. Either case, hm.stateMutexLocked should be + // false. + if hm.stateMutexLocked { + panic("hostMounts: stateMutexLocked already true when locking stateMutex") + } + hm.stateMutexLocked = true +} + +// Unlocks the state mutex +func (hm *hostMounts) Unlock() { + hm.expectLocked() + hm.stateMutexLocked = false + hm.stateMutex.Unlock() +} - mountTarget := filepath.Clean(mountPath) - if source, ok := hm.readWriteMounts[mountTarget]; ok { - return fmt.Errorf("read-write with source %q and mount target %q already exists", source.sourcePath, mountPath) +func (hm *hostMounts) findDeviceAtPath(mountPath string) *device { + hm.expectLocked() + + if dev, ok := hm.devices[mountPath]; ok { + return dev } - hm.readWriteMounts[mountTarget] = &rwDevice{ - mountPath: mountTarget, - sourcePath: sourcePath, - encrypted: encrypted, + return nil +} + +func (hm *hostMounts) addDeviceToMapChecked(dev *device) error { + hm.expectLocked() + + if _, ok := hm.devices[dev.mountPath]; ok { + return fmt.Errorf("device at mount path %q already exists", dev.mountPath) } + hm.devices[dev.mountPath] = dev return nil } -// RemoveRWDevice removes the read-write device metadata for device mounted at -// `mountPath`. -func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string) error { - hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() +func (hm *hostMounts) findDeviceContainingPath(path string) *device { + hm.expectLocked() + + // TODO: can we refactor this function by walking each component of the path + // from leaf to root, each time checking if the current component is a mount + // point? (i.e. why do we have to use filepath.Rel?) + + var foundDev *device + cleanPath := filepath.Clean(path) + for devPath, dev := range hm.devices { + relPath, err := filepath.Rel(devPath, cleanPath) + // skip further checks if an error is returned or the relative path + // contains "..", meaning that the `path` isn't directly nested under + // `rwPath`. + if err != nil || strings.HasPrefix(relPath, "..") { + continue + } + if foundDev == nil { + foundDev = dev + } else if len(dev.mountPath) > len(foundDev.mountPath) { + // The current device is mounted on top of a previously found device. + foundDev = dev + } + } + return foundDev +} + +func (hm *hostMounts) usePath(path string) (*device, error) { + hm.expectLocked() + + // Find the device at the given path and increment its usage count. + dev := hm.findDeviceContainingPath(path) + if dev == nil { + return nil, nil + } + dev.usage++ + return dev, nil +} + +func (hm *hostMounts) releaseDeviceUsage(dev *device) { + hm.expectLocked() + + if dev.usage <= 0 { + log.G(context.Background()).WithFields(logrus.Fields{ + "device": dev.mountPath, + "deviceSource": dev.sourcePath, + "deviceType": dev.ty, + "usage": dev.usage, + }).Error("hostMounts::releaseDeviceUsage: unexpected zero usage count") + return + } + dev.usage-- +} + +// User should carefully handle side-effects of adding a device if the device +// fails to be added. +func (hm *hostMounts) doAddDevice(mountPath string, ty deviceType, sourcePath string) (*device, error) { + hm.expectLocked() + + dev := &device{ + mountPath: filepath.Clean(mountPath), + ty: ty, + usage: 0, + sourcePath: sourcePath, + } + + if err := hm.addDeviceToMapChecked(dev); err != nil { + return nil, err + } + return dev, nil +} + +// Once checks is called, unless it returns an error, this function will always +// succeed +func (hm *hostMounts) doRemoveDevice(mountPath string, ty deviceType, sourcePath string, checks func(*device) error) error { + hm.expectLocked() unmountTarget := filepath.Clean(mountPath) - device, ok := hm.readWriteMounts[unmountTarget] - if !ok { + device := hm.findDeviceAtPath(unmountTarget) + if device == nil { // already removed or didn't exist return nil } if device.sourcePath != sourcePath { - return fmt.Errorf("wrong sourcePath %s", sourcePath) + return fmt.Errorf("wrong sourcePath %s, expected %s", sourcePath, device.sourcePath) + } + if device.ty != ty { + return fmt.Errorf("wrong device type %s, expected %s", ty, device.ty) + } + if device.usage > 0 { + log.G(context.Background()).WithFields(logrus.Fields{ + "device": device.mountPath, + "deviceSource": device.sourcePath, + "deviceType": device.ty, + "usage": device.usage, + }).Error("hostMounts::doRemoveDevice: device still in use, refusing unmount") + return fmt.Errorf("device at %q is still in use, can't unmount", unmountTarget) + } + if checks != nil { + if err := checks(device); err != nil { + return err + } } - delete(hm.readWriteMounts, unmountTarget) + delete(hm.devices, unmountTarget) return nil } +func (hm *hostMounts) AddRODevice(mountPath string, sourcePath string) error { + hm.expectLocked() + + _, err := hm.doAddDevice(mountPath, DeviceTypeRO, sourcePath) + return err +} + +// AddRWDevice adds read-write device metadata for device mounted at `mountPath`. +// Returns an error if there's an existing device mounted at `mountPath` location. +func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted bool) error { + hm.expectLocked() + + dev, err := hm.doAddDevice(mountPath, DeviceTypeRW, sourcePath) + if err != nil { + return err + } + dev.encrypted = encrypted + return nil +} + +func (hm *hostMounts) AddOverlay(mountPath string, layers []string, scratchDir string) (err error) { + hm.expectLocked() + + dev, err := hm.doAddDevice(mountPath, DeviceTypeOverlay, mountPath) + if err != nil { + return err + } + dev.referencedDevices = make([]*device, 0, len(layers)+1) + defer func() { + if err != nil { + // If we failed to use any of the paths, we need to release the ones + // that we did use. + for _, d := range dev.referencedDevices { + hm.releaseDeviceUsage(d) + } + delete(hm.devices, mountPath) + } + }() + + for _, layer := range layers { + refDev, err := hm.usePath(layer) + if err != nil { + return err + } + if refDev != nil { + dev.referencedDevices = append(dev.referencedDevices, refDev) + } + } + refDev, err := hm.usePath(scratchDir) + if err != nil { + return err + } + if refDev != nil { + dev.referencedDevices = append(dev.referencedDevices, refDev) + } + + return nil +} + +func (hm *hostMounts) RemoveRODevice(mountPath string, sourcePath string) error { + hm.expectLocked() + + return hm.doRemoveDevice(mountPath, DeviceTypeRO, sourcePath, nil) +} + +// RemoveRWDevice removes the read-write device metadata for device mounted at +// `mountPath`. +func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string, encrypted bool) error { + hm.expectLocked() + + return hm.doRemoveDevice(mountPath, DeviceTypeRW, sourcePath, func(dev *device) error { + if dev.encrypted != encrypted { + return fmt.Errorf("encrypted flag wrong, provided %v, expected %v", encrypted, dev.encrypted) + } + return nil + }) +} + +func (hm *hostMounts) RemoveOverlay(mountPath string) (undo func(), err error) { + hm.expectLocked() + + var dev *device + err = hm.doRemoveDevice(mountPath, DeviceTypeOverlay, mountPath, func(_dev *device) error { + dev = _dev + for _, refDev := range dev.referencedDevices { + hm.releaseDeviceUsage(refDev) + } + return nil + }) + if err != nil { + // If we get an error from doRemoveDevice, we have not released anything + // yet. + return nil, err + } + undo = func() { + hm.expectLocked() + + for _, refDev := range dev.referencedDevices { + refDev.usage++ + } + + if _, ok := hm.devices[mountPath]; ok { + log.G(context.Background()).WithField("mountPath", mountPath).Error( + "hostMounts::RemoveOverlay: failed to undo remove: device that was removed exists in map", + ) + return + } + + hm.devices[mountPath] = dev + } + return undo, nil +} + // IsEncrypted checks if the given path is a sub-path of an encrypted read-write // device. func (hm *hostMounts) IsEncrypted(path string) bool { - hm.stateMutex.Lock() - defer hm.stateMutex.Unlock() + hm.expectLocked() - parentPath := "" - encrypted := false - cleanPath := filepath.Clean(path) - for rwPath, rwDev := range hm.readWriteMounts { - relPath, err := filepath.Rel(rwPath, cleanPath) - // skip further checks if an error is returned or the relative path - // contains "..", meaning that the `path` isn't directly nested under - // `rwPath`. - if err != nil || strings.HasPrefix(relPath, "..") { - continue - } - if len(rwDev.mountPath) > len(parentPath) { - parentPath = rwDev.mountPath - encrypted = rwDev.encrypted - } + dev := hm.findDeviceContainingPath(path) + if dev == nil { + return false + } + return dev.encrypted +} + +func (hm *hostMounts) HasOverlayMountedAt(path string) bool { + hm.expectLocked() + + dev := hm.findDeviceAtPath(filepath.Clean(path)) + if dev == nil { + return false } - return encrypted + return dev.ty == DeviceTypeOverlay } diff --git a/internal/guest/runtime/hcsv2/uvm_state_test.go b/internal/guest/runtime/hcsv2/uvm_state_test.go index b708caaeba..e87a207308 100644 --- a/internal/guest/runtime/hcsv2/uvm_state_test.go +++ b/internal/guest/runtime/hcsv2/uvm_state_test.go @@ -12,10 +12,13 @@ func Test_Add_Remove_RWDevice(t *testing.T) { mountPath := "/run/gcs/c/abcd" sourcePath := "/dev/sda" + hm.Lock() + defer hm.Unlock() + if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error adding RW device: %s", err) } - if err := hm.RemoveRWDevice(mountPath, sourcePath); err != nil { + if err := hm.RemoveRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error removing RW device: %s", err) } } @@ -25,29 +28,55 @@ func Test_Cannot_AddRWDevice_Twice(t *testing.T) { mountPath := "/run/gcs/c/abc" sourcePath := "/dev/sda" + hm.Lock() if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error: %s", err) } + hm.Unlock() + + hm.Lock() if err := hm.AddRWDevice(mountPath, sourcePath, false); err == nil { t.Fatalf("expected error adding %q for the second time", mountPath) } + hm.Unlock() } func Test_Cannot_RemoveRWDevice_Wrong_Source(t *testing.T) { hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + mountPath := "/run/gcs/c/abcd" sourcePath := "/dev/sda" wrongSource := "/dev/sdb" if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { t.Fatalf("unexpected error: %s", err) } - if err := hm.RemoveRWDevice(mountPath, wrongSource); err == nil { + if err := hm.RemoveRWDevice(mountPath, wrongSource, false); err == nil { t.Fatalf("expected error removing wrong source %s", wrongSource) } } +func Test_Cannot_RemoveRWDevice_Wrong_Encrypted(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abcd" + sourcePath := "/dev/sda" + if err := hm.AddRWDevice(mountPath, sourcePath, false); err != nil { + t.Fatalf("unexpected error: %s", err) + } + if err := hm.RemoveRWDevice(mountPath, sourcePath, true); err == nil { + t.Fatalf("expected error removing RW device with wrong encrypted flag") + } +} + func Test_HostMounts_IsEncrypted(t *testing.T) { hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + encryptedPath := "/run/gcs/c/encrypted" encryptedSource := "/dev/sda" if err := hm.AddRWDevice(encryptedPath, encryptedSource, true); err != nil { @@ -108,3 +137,189 @@ func Test_HostMounts_IsEncrypted(t *testing.T) { }) } } + +func Test_HostMounts_AddRemoveRODevice(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abcd" + sourcePath := "/dev/sda" + + if err := hm.AddRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + + if err := hm.RemoveRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error removing RO device: %s", err) + } +} + +func Test_HostMounts_Cannot_AddRODevice_Twice(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/abc" + sourcePath := "/dev/sda" + + if err := hm.AddRODevice(mountPath, sourcePath); err != nil { + t.Fatalf("unexpected error: %s", err) + } + if err := hm.AddRODevice(mountPath, sourcePath); err == nil { + t.Fatalf("expected error adding %q for the second time", mountPath) + } +} + +func Test_HostMounts_AddRemoveOverlay(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/aaaa/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + scratchDir := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil { + t.Fatalf("unexpected error adding overlay: %s", err) + } + undo, err := hm.RemoveOverlay(mountPath) + if err != nil { + t.Fatalf("unexpected error removing overlay: %s", err) + } + if undo == nil { + t.Fatalf("expected undo function to be non-nil") + } + undo() + if _, err = hm.RemoveOverlay(mountPath); err != nil { + t.Fatalf("unexpected error removing overlay again: %s", err) + } +} + +func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + mountPath := "/run/gcs/c/aaaa/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + scratchDir := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(mountPath, layers, scratchDir); err != nil { + t.Fatalf("unexpected error adding overlay: %s", err) + } + + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s while in use by overlay", layer) + } + } + if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay", scratchDir) + } + + if _, err := hm.RemoveOverlay(mountPath); err != nil { + t.Fatalf("unexpected error removing overlay: %s", err) + } + + // now we can remove + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error removing RO device %s: %s", layer, err) + } + } + if err := hm.RemoveRWDevice(scratchDir, scratchDir, true); err != nil { + t.Fatalf("unexpected error removing RW device %s: %s", scratchDir, err) + } +} + +func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay_MultipleUsers(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + overlay1 := "/run/gcs/c/aaaa/rootfs" + overlay2 := "/run/gcs/c/bbbb/rootfs" + layers := []string{ + "/run/mounts/scsi/m1", + "/run/mounts/scsi/m2", + "/run/mounts/scsi/m3", + } + for _, layer := range layers { + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + } + sharedScratchMount := "/run/gcs/c/sandbox" + scratch1 := sharedScratchMount + "/scratch/aaaa" + scratch2 := sharedScratchMount + "/scratch/bbbb" + if err := hm.AddRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + if err := hm.AddOverlay(overlay1, layers, scratch1); err != nil { + t.Fatalf("unexpected error adding overlay1: %s", err) + } + + if err := hm.AddOverlay(overlay2, layers[0:2], scratch2); err != nil { + t.Fatalf("unexpected error adding overlay2: %s", err) + } + + for _, layer := range layers { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s while in use by overlay", layer) + } + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay", sharedScratchMount) + } + + if _, err := hm.RemoveOverlay(overlay1); err != nil { + t.Fatalf("unexpected error removing overlay 1: %s", err) + } + + for _, layer := range layers[0:2] { + if err := hm.RemoveRODevice(layer, layer); err == nil { + t.Fatalf("expected error removing RO device %s (still in use by overlay 2)", layer) + } + } + if err := hm.RemoveRODevice(layers[2], layers[2]); err != nil { + t.Fatalf("unexpected error removing layers[2] which is not being used by overlay 2: %s", err) + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err == nil { + t.Fatalf("expected error removing RW device %s while in use by overlay 2", scratch2) + } + + if _, err := hm.RemoveOverlay(overlay2); err != nil { + t.Fatalf("unexpected error removing overlay 2: %s", err) + } + for _, layer := range layers[0:2] { + if err := hm.RemoveRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error removing RO device %s: %s", layer, err) + } + } + if err := hm.RemoveRWDevice(sharedScratchMount, sharedScratchMount, true); err != nil { + t.Fatalf("unexpected error removing RW device %s: %s", sharedScratchMount, err) + } +} From 003b0716ce503601484e2d188d1257dc9457671f Mon Sep 17 00:00:00 2001 From: Maksim An Date: Thu, 18 Jun 2026 23:14:43 -0700 Subject: [PATCH 4/6] gcs: tighten hcsv2 container lifecycle and mount checks Move container lifecycle tracking to explicit states (created, running, terminated) and use those states for container access, overlay usage, and confidential delete-state guards. Fix overlay mount path normalization so add/remove works reliably with clean vs unclean paths. Rename GetCreatedContainer to GetInitializedContainer and update callers/tests. Add lifecycle and path-normalization regression tests. Assisted-by: Github Copilot Signed-off-by: Maksim An --- internal/guest/bridge/bridge_v2.go | 4 +- internal/guest/runtime/hcsv2/container.go | 10 +- .../guest/runtime/hcsv2/lifecycle_test.go | 143 ++++++++++++++++++ internal/guest/runtime/hcsv2/process.go | 2 +- internal/guest/runtime/hcsv2/uvm.go | 26 ++-- internal/guest/runtime/hcsv2/uvm_state.go | 16 +- .../guest/runtime/hcsv2/uvm_state_test.go | 54 +++++++ test/gcs/container_test.go | 4 +- 8 files changed, 231 insertions(+), 28 deletions(-) create mode 100644 internal/guest/runtime/hcsv2/lifecycle_test.go diff --git a/internal/guest/bridge/bridge_v2.go b/internal/guest/bridge/bridge_v2.go index 748e9dcd9b..be2102998b 100644 --- a/internal/guest/bridge/bridge_v2.go +++ b/internal/guest/bridge/bridge_v2.go @@ -358,7 +358,7 @@ func (b *Bridge) waitOnProcessV2(r *Request) (_ RequestResponse, err error) { } exitCodeChan, doneChan = p.Wait() } else { - c, err := b.hostState.GetCreatedContainer(request.ContainerID) + c, err := b.hostState.GetInitializedContainer(request.ContainerID) if err != nil { return nil, err } @@ -404,7 +404,7 @@ func (b *Bridge) resizeConsoleV2(r *Request) (_ RequestResponse, err error) { trace.Int64Attribute("height", int64(request.Height)), trace.Int64Attribute("width", int64(request.Width))) - c, err := b.hostState.GetCreatedContainer(request.ContainerID) + c, err := b.hostState.GetInitializedContainer(request.ContainerID) if err != nil { return nil, err } diff --git a/internal/guest/runtime/hcsv2/container.go b/internal/guest/runtime/hcsv2/container.go index 15d416ae96..26924687d3 100644 --- a/internal/guest/runtime/hcsv2/container.go +++ b/internal/guest/runtime/hcsv2/container.go @@ -44,6 +44,11 @@ const ( // containerCreated is the status when a runtime container and init process // have been assigned, but runtime start command has not been issued yet containerCreated + // containerRunning is the status when the init process has started and has + // not yet exited. + containerRunning + // containerTerminated is the status when the init process has exited. + containerTerminated ) type Container struct { @@ -77,9 +82,6 @@ type Container struct { // and deal with the extra pointer dereferencing overhead. status atomic.Uint32 - // Set to true when the init process for the container has exited - terminated atomic.Bool - // scratchDirPath represents the path inside the UVM where the scratch directory // of this container is located. Usually, this is either `/run/gcs/c/` or // `/run/gcs/c//container_` if scratch is shared with UVM scratch. @@ -139,6 +141,8 @@ func (c *Container) Start(ctx context.Context, conSettings stdio.ConnectionSetti err = c.container.Start() if err != nil { stdioSet.Close() + } else { + c.setStatus(containerRunning) } return int(c.initProcess.pid), err } diff --git a/internal/guest/runtime/hcsv2/lifecycle_test.go b/internal/guest/runtime/hcsv2/lifecycle_test.go new file mode 100644 index 0000000000..3f2a7420e1 --- /dev/null +++ b/internal/guest/runtime/hcsv2/lifecycle_test.go @@ -0,0 +1,143 @@ +//go:build linux +// +build linux + +package hcsv2 + +import ( + "context" + "io" + "strings" + "testing" + + oci "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/Microsoft/hcsshim/pkg/securitypolicy" +) + +type fakeConfidentialPolicyEnforcer struct { + *securitypolicy.OpenDoorSecurityPolicyEnforcer +} + +func (f *fakeConfidentialPolicyEnforcer) EncodedSecurityPolicy() string { + return "test-policy" +} + +func newConfidentialHostForTest() *Host { + h := NewHost(nil, nil, &securitypolicy.OpenDoorSecurityPolicyEnforcer{}, io.Discard) + h.securityOptions = securitypolicy.NewSecurityOptions( + &fakeConfidentialPolicyEnforcer{OpenDoorSecurityPolicyEnforcer: &securitypolicy.OpenDoorSecurityPolicyEnforcer{}}, + false, + "", + "", + io.Discard, + ) + return h +} + +func newTestContainer(id, rootfs string, st containerStatus) *Container { + c := &Container{ + id: id, + spec: &oci.Spec{Root: &oci.Root{Path: rootfs}}, + } + c.setStatus(st) + return c +} + +func TestGetInitializedContainer_StateGating(t *testing.T) { + h := &Host{containers: map[string]*Container{}} + + for _, tc := range []struct { + name string + status containerStatus + wantError bool + }{ + {name: "creating denied", status: containerCreating, wantError: true}, + {name: "created allowed", status: containerCreated, wantError: false}, + {name: "running allowed", status: containerRunning, wantError: false}, + {name: "terminated allowed", status: containerTerminated, wantError: false}, + } { + t.Run(tc.name, func(t *testing.T) { + id := "cid-" + tc.name + h.containers = map[string]*Container{id: newTestContainer(id, "/rootfs", tc.status)} + + got, err := h.GetInitializedContainer(id) + if tc.wantError { + if err == nil { + t.Fatalf("expected error for status %v", tc.status) + } + return + } + if err != nil { + t.Fatalf("unexpected error for status %v: %v", tc.status, err) + } + if got == nil { + t.Fatalf("expected non-nil container for status %v", tc.status) + } + }) + } +} + +func TestIsOverlayInUse_OnlyRunningContainersCount(t *testing.T) { + overlay := "/run/gcs/c/abc/rootfs" + + h := &Host{ + containers: map[string]*Container{ + "created": newTestContainer("created", overlay, containerCreated), + "terminated": newTestContainer("terminated", overlay, containerTerminated), + "other": newTestContainer("other", "/run/gcs/c/xyz/rootfs", containerRunning), + }, + } + + if h.IsOverlayInUse(overlay) { + t.Fatalf("expected overlay %q not in use when no running container uses it", overlay) + } + + h.containers["running"] = newTestContainer("running", overlay, containerRunning) + if !h.IsOverlayInUse(overlay) { + t.Fatalf("expected overlay %q in use when running container uses it", overlay) + } +} + +func TestDeleteContainerState_DeniesRunningContainerInConfidentialMode(t *testing.T) { + h := newConfidentialHostForTest() + id := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + h.containers[id] = newTestContainer(id, "/run/gcs/c/abc/rootfs", containerRunning) + + err := h.DeleteContainerState(context.Background(), id) + if err == nil { + t.Fatal("expected error deleting running container state") + } + if !strings.Contains(err.Error(), "Denied deleting state of a running container") { + t.Fatalf("unexpected error: %v", err) + } + if _, ok := h.containers[id]; !ok { + t.Fatalf("container %q should remain registered on early deny", id) + } +} + +func TestDeleteContainerState_DeniesIfOverlayStillMountedInConfidentialMode(t *testing.T) { + h := newConfidentialHostForTest() + h.hostMounts = newHostMounts() + + id := "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + overlay := "/run/gcs/c/def/rootfs" + h.containers[id] = newTestContainer(id, overlay, containerCreated) + + h.hostMounts.Lock() + if err := h.hostMounts.AddOverlay(overlay, nil, ""); err != nil { + h.hostMounts.Unlock() + t.Fatalf("failed to seed overlay mount: %v", err) + } + h.hostMounts.Unlock() + + err := h.DeleteContainerState(context.Background(), id) + if err == nil { + t.Fatal("expected error deleting state with active overlay") + } + if !strings.Contains(err.Error(), "overlay mount still active") { + t.Fatalf("unexpected error: %v", err) + } + if _, ok := h.containers[id]; !ok { + t.Fatalf("container %q should remain registered on early deny", id) + } +} diff --git a/internal/guest/runtime/hcsv2/process.go b/internal/guest/runtime/hcsv2/process.go index b4a9b60f72..ce364d9b4f 100644 --- a/internal/guest/runtime/hcsv2/process.go +++ b/internal/guest/runtime/hcsv2/process.go @@ -100,7 +100,7 @@ func newProcess(c *Container, spec *oci.Process, process runtime.Process, pid ui } p.exitCode = exitCode if p.init { - c.terminated.Store(true) + c.setStatus(containerTerminated) } log.G(ctx).WithField("exitCode", p.exitCode).Debug("process exited") diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 63545fa7ef..20e2c0eb6d 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -17,7 +17,6 @@ import ( "regexp" "strings" "sync" - "sync/atomic" "syscall" "time" @@ -298,7 +297,7 @@ func (h *Host) RemoveContainer(id string) { } } -func (h *Host) GetCreatedContainer(id string) (*Container, error) { +func (h *Host) GetInitializedContainer(id string) (*Container, error) { h.containersMutex.Lock() defer h.containersMutex.Unlock() @@ -306,8 +305,8 @@ func (h *Host) GetCreatedContainer(id string) (*Container, error) { if !ok { return nil, gcserr.NewHresultError(gcserr.HrVmcomputeSystemNotFound) } - if c.getStatus() != containerCreated { - return nil, fmt.Errorf("container is not in state \"created\": %w", + if c.getStatus() == containerCreating { + return nil, fmt.Errorf("container is still being created: %w", gcserr.NewHresultError(gcserr.HrVmcomputeInvalidState)) } return c, nil @@ -506,7 +505,6 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM isSandbox: criType == "sandbox", exitType: prot.NtUnexpectedExit, processes: make(map[uint32]*containerProcess), - terminated: atomic.Bool{}, scratchDirPath: settings.ScratchDirPath, } c.setStatus(containerCreating) @@ -822,7 +820,7 @@ func (h *Host) IsOverlayInUse(overlayPath string) bool { defer h.containersMutex.Unlock() for _, c := range h.containers { - if c.terminated.Load() { + if c.getStatus() != containerRunning { continue } @@ -881,7 +879,7 @@ func (h *Host) modifyHostSettings(ctx context.Context, containerID string, req * case guestresource.ResourceTypeVPCIDevice: return modifyMappedVPCIDevice(ctx, req.RequestType, req.Settings.(*guestresource.LCOWMappedVPCIDevice)) case guestresource.ResourceTypeContainerConstraints: - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return err } @@ -926,7 +924,7 @@ func (h *Host) modifyContainerSettings(ctx context.Context, containerID string, } } - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return err } @@ -954,7 +952,7 @@ func (*Host) Shutdown() { // Called to shutdown a container func (h *Host) ShutdownContainer(ctx context.Context, containerID string, graceful bool) error { - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return err } @@ -973,7 +971,7 @@ func (h *Host) ShutdownContainer(ctx context.Context, containerID string, gracef } func (h *Host) SignalContainerProcess(ctx context.Context, containerID string, processID uint32, signal syscall.Signal) error { - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return err } @@ -1026,7 +1024,7 @@ func (h *Host) ExecProcess(ctx context.Context, containerID string, params prot. tport = h.devNullTransport } pid, err = h.runExternalProcess(ctx, params, conSettings, tport) - } else if c, err = h.GetCreatedContainer(containerID); err == nil { + } else if c, err = h.GetInitializedContainer(containerID); err == nil { // We found a V2 container. Treat this as a V2 process. if params.OCIProcess == nil { // We've already done policy enforcement for creating a container so @@ -1102,7 +1100,7 @@ func (h *Host) GetProperties(ctx context.Context, containerID string, query prot return nil, errors.Wrapf(err, "get properties denied due to policy") } - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return nil, err } @@ -1870,12 +1868,12 @@ func (h *Host) DeleteContainerState(ctx context.Context, containerID string) err return err } - c, err := h.GetCreatedContainer(containerID) + c, err := h.GetInitializedContainer(containerID) if err != nil { return err } if h.HasSecurityPolicy() { - if !c.terminated.Load() { + if c.getStatus() == containerRunning { return errors.Errorf("Denied deleting state of a running container %q", containerID) } overlay := c.spec.Root.Path diff --git a/internal/guest/runtime/hcsv2/uvm_state.go b/internal/guest/runtime/hcsv2/uvm_state.go index f7d347cbf4..090fb039eb 100644 --- a/internal/guest/runtime/hcsv2/uvm_state.go +++ b/internal/guest/runtime/hcsv2/uvm_state.go @@ -248,7 +248,9 @@ func (hm *hostMounts) AddRWDevice(mountPath string, sourcePath string, encrypted func (hm *hostMounts) AddOverlay(mountPath string, layers []string, scratchDir string) (err error) { hm.expectLocked() - dev, err := hm.doAddDevice(mountPath, DeviceTypeOverlay, mountPath) + cleanMountPath := filepath.Clean(mountPath) + + dev, err := hm.doAddDevice(cleanMountPath, DeviceTypeOverlay, cleanMountPath) if err != nil { return err } @@ -260,7 +262,7 @@ func (hm *hostMounts) AddOverlay(mountPath string, layers []string, scratchDir s for _, d := range dev.referencedDevices { hm.releaseDeviceUsage(d) } - delete(hm.devices, mountPath) + delete(hm.devices, dev.mountPath) } }() @@ -306,8 +308,10 @@ func (hm *hostMounts) RemoveRWDevice(mountPath string, sourcePath string, encryp func (hm *hostMounts) RemoveOverlay(mountPath string) (undo func(), err error) { hm.expectLocked() + cleanMountPath := filepath.Clean(mountPath) + var dev *device - err = hm.doRemoveDevice(mountPath, DeviceTypeOverlay, mountPath, func(_dev *device) error { + err = hm.doRemoveDevice(cleanMountPath, DeviceTypeOverlay, cleanMountPath, func(_dev *device) error { dev = _dev for _, refDev := range dev.referencedDevices { hm.releaseDeviceUsage(refDev) @@ -326,14 +330,14 @@ func (hm *hostMounts) RemoveOverlay(mountPath string) (undo func(), err error) { refDev.usage++ } - if _, ok := hm.devices[mountPath]; ok { - log.G(context.Background()).WithField("mountPath", mountPath).Error( + if _, ok := hm.devices[cleanMountPath]; ok { + log.G(context.Background()).WithField("mountPath", cleanMountPath).Error( "hostMounts::RemoveOverlay: failed to undo remove: device that was removed exists in map", ) return } - hm.devices[mountPath] = dev + hm.devices[cleanMountPath] = dev } return undo, nil } diff --git a/internal/guest/runtime/hcsv2/uvm_state_test.go b/internal/guest/runtime/hcsv2/uvm_state_test.go index e87a207308..23dd1e50c3 100644 --- a/internal/guest/runtime/hcsv2/uvm_state_test.go +++ b/internal/guest/runtime/hcsv2/uvm_state_test.go @@ -323,3 +323,57 @@ func Test_HostMounts_Cannot_RemoveInUseDeviceByOverlay_MultipleUsers(t *testing. t.Fatalf("unexpected error removing RW device %s: %s", sharedScratchMount, err) } } + +func Test_HostMounts_RemoveOverlay_PathNormalization_AddUncleanRemoveClean(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + layer := "/run/mounts/scsi/m1" + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + + scratch := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratch, scratch, false); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + + uncleanOverlay := "/run/gcs/c/aaaa/./rootfs" + cleanOverlay := "/run/gcs/c/aaaa/rootfs" + + if err := hm.AddOverlay(uncleanOverlay, []string{layer}, scratch); err != nil { + t.Fatalf("unexpected error adding overlay with unclean path: %s", err) + } + + if _, err := hm.RemoveOverlay(cleanOverlay); err != nil { + t.Fatalf("expected removing overlay with clean path to succeed after add with unclean path, got error: %s", err) + } +} + +func Test_HostMounts_RemoveOverlay_PathNormalization_AddCleanRemoveUnclean(t *testing.T) { + hm := newHostMounts() + hm.Lock() + defer hm.Unlock() + + layer := "/run/mounts/scsi/m1" + if err := hm.AddRODevice(layer, layer); err != nil { + t.Fatalf("unexpected error adding RO device: %s", err) + } + + scratch := "/run/gcs/c/aaaa/scratch" + if err := hm.AddRWDevice(scratch, scratch, false); err != nil { + t.Fatalf("unexpected error adding RW device: %s", err) + } + + cleanOverlay := "/run/gcs/c/aaaa/rootfs" + uncleanOverlay := "/run/gcs/c/aaaa/./rootfs" + + if err := hm.AddOverlay(cleanOverlay, []string{layer}, scratch); err != nil { + t.Fatalf("unexpected error adding overlay with clean path: %s", err) + } + + if _, err := hm.RemoveOverlay(uncleanOverlay); err != nil { + t.Fatalf("expected removing overlay with unclean path to succeed after add with clean path, got error: %s", err) + } +} diff --git a/test/gcs/container_test.go b/test/gcs/container_test.go index 4f3db015c0..aa4cf6ef10 100644 --- a/test/gcs/container_test.go +++ b/test/gcs/container_test.go @@ -77,9 +77,9 @@ func TestContainerDelete(t *testing.T) { cleanupContainer(ctx, t, host, c) - _, err := host.GetCreatedContainer(id) + _, err := host.GetInitializedContainer(id) if hr, herr := gcserr.GetHresult(err); herr != nil || hr != gcserr.HrVmcomputeSystemNotFound { - t.Fatalf("GetCreatedContainer returned %v, wanted %v", err, gcserr.HrVmcomputeSystemNotFound) + t.Fatalf("GetInitializedContainer returned %v, wanted %v", err, gcserr.HrVmcomputeSystemNotFound) } assertNumberContainers(ctx, t, rtime, 0) } From 72336d23ffeb1840f1b67e7b2e8d144be8e8c049 Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Tue, 11 Nov 2025 16:55:12 +0000 Subject: [PATCH 5/6] Merged PR 13627088: guest: Don't allow host to set mount options [cherry-picked from 1dd0b7ea0b0f91d3698f6008fb0bd5b0de777da6] Blocks mount option passing for 9p (which is accidental) and SCSI disks. - guest: Restrict plan9 share names to digits only on Confidential mode - hcsv2/uvm: Restrict SCSI mount options in confidential mode (The only one we allow is `ro`) Related work items: #34370380 Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 20 ++++++++++++++++++++ internal/guest/storage/plan9/plan9.go | 14 ++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index 20e2c0eb6d..c8ef075e83 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -1326,6 +1326,20 @@ func (h *Host) modifyMappedVirtualDisk( mountCtx, cancel := context.WithTimeout(ctx, time.Second*5) defer cancel() if mvd.MountPath != "" { + if h.HasSecurityPolicy() { + // The only option we allow if there is policy enforcement is + // "ro", and it must be present iff the request is readonly. + expectedOptions := []string{} + if mvd.ReadOnly { + expectedOptions = []string{"ro"} + } + if !slices.Equal(mvd.Options, expectedOptions) { + return errors.Errorf( + "mounting scsi device controller %d lun %d onto %s with mount options %q denied by policy: expected %q (mvd.ReadOnly=%t)", + mvd.Controller, mvd.Lun, mvd.MountPath, strings.Join(mvd.Options, ","), strings.Join(expectedOptions, ","), mvd.ReadOnly, + ) + } + } if mvd.ReadOnly { var deviceHash string if verityInfo != nil { @@ -1487,6 +1501,12 @@ func (h *Host) modifyMappedDirectory( return errors.Wrapf(err, "mounting plan9 device at %s denied by policy", md.MountPath) } + if h.HasSecurityPolicy() { + if err = plan9.ValidateShareName(md.ShareName); err != nil { + return err + } + } + // Similar to the reasoning in modifyMappedVirtualDisk, since we're // rolling back the policy metadata, plan9.Mount here must clean up // everything if it fails, which it does do. diff --git a/internal/guest/storage/plan9/plan9.go b/internal/guest/storage/plan9/plan9.go index 5c1f1d74f4..44ac0f4e4e 100644 --- a/internal/guest/storage/plan9/plan9.go +++ b/internal/guest/storage/plan9/plan9.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "os" + "regexp" "syscall" "github.com/Microsoft/hcsshim/internal/guest/transport" @@ -25,6 +26,19 @@ var ( unixMount = unix.Mount ) +// c.f. v9fs_parse_options in linux/fs/9p/v9fs.c - technically anything other +// than ',' is ok (quoting is not handled), however, this name is generated from +// a counter in AddPlan9 (internal/uvm/plan9.go), and therefore we expect only +// digits from a normal hcsshim host. +var validShareNameRegex = regexp.MustCompile(`^[0-9]+$`) + +func ValidateShareName(name string) error { + if !validShareNameRegex.MatchString(name) { + return fmt.Errorf("invalid plan9 share name %q: must match regex %q", name, validShareNameRegex.String()) + } + return nil +} + // Mount dials a connection from `vsock` and mounts a Plan9 share to `target`. // // `target` will be created. On mount failure the created `target` will be From 15c2981ba74bb371cb38db6a0b350afe4c67862a Mon Sep 17 00:00:00 2001 From: Tingmao Wang Date: Wed, 4 Mar 2026 14:32:21 +0000 Subject: [PATCH 6/6] Merged PR 13694574: hcsv2/uvm, rego: Enforce OCI spec does not contain any LinuxDevices [cherry-picked from 9f69e49f3..72b338a99] [forward-ported from original PR 13653011 (e631aa42c6039ab0d228b746b280c26809433f00)] See https://msazure.visualstudio.com/ContainerPlatform/_git/Microsoft.hcsshim/pullrequest/13653011 Signed-off-by: Tingmao Wang --- internal/guest/runtime/hcsv2/uvm.go | 13 ++++++++ pkg/securitypolicy/framework.rego | 16 +++++++++ pkg/securitypolicy/regopolicy_linux_test.go | 33 +++++++++++++++++++ pkg/securitypolicy/securitypolicyenforcer.go | 1 + .../securitypolicyenforcer_rego.go | 25 ++++++++++++++ 5 files changed, 88 insertions(+) diff --git a/internal/guest/runtime/hcsv2/uvm.go b/internal/guest/runtime/hcsv2/uvm.go index c8ef075e83..c7a6a125d6 100644 --- a/internal/guest/runtime/hcsv2/uvm.go +++ b/internal/guest/runtime/hcsv2/uvm.go @@ -15,6 +15,7 @@ import ( "path" "path/filepath" "regexp" + "slices" "strings" "sync" "syscall" @@ -529,6 +530,17 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM h.RemoveContainer(id) } }() + // Take a backup of the devices array before we populate it with any devices + // found by GCS, in order to pass to the policy enforcer later. + // + // In specGuest.ApplyAnnotationsToSpec, if this is a privileged container, + // we will add devices found in the GCS namespace's /dev. Regardless of + // privileged or not, we also always include /dev/sev-guest. Since the + // policy already lets the user enforce whether the container should be + // privileged or not, and the sev-guest device is always added for a + // confidential container, we do not need the policy enforcer to check these + // devices we dynamically add again. + extraLinuxDevices := slices.Clone(settings.OCISpecification.Linux.Devices) var namespaceID string if isCRI { @@ -671,6 +683,7 @@ func (h *Host) CreateContainer(ctx context.Context, id string, settings *prot.VM Capabilities: settings.OCISpecification.Process.Capabilities, SeccompProfileSHA256: seccomp, IsSandboxContainer: c.isSandbox, + LinuxDevices: extraLinuxDevices, } envToKeep, capsToKeep, allowStdio, err := h.securityOptions.PolicyEnforcer.EnforceCreateContainerPolicyV2( ctx, diff --git a/pkg/securitypolicy/framework.rego b/pkg/securitypolicy/framework.rego index 76e5b048a0..6af20467be 100644 --- a/pkg/securitypolicy/framework.rego +++ b/pkg/securitypolicy/framework.rego @@ -388,6 +388,14 @@ seccomp_ok(seccomp_profile_sha256) { is_windows } +devices_ok(expected_devices, actual_devices) { + # Allow out of order but not duplicates + set_expected := {dev | dev := expected_devices[_]} + set_actual := {dev | dev := actual_devices[_]} + set_expected == set_actual + count(set_actual) == count(actual_devices) +} + default container_started := false container_started { @@ -599,6 +607,8 @@ create_container := {"metadata": [updateMatches, addStarted], command_ok(container.command) mountList_ok(container.mounts, container.allow_elevated) seccomp_ok(container.seccomp_profile_sha256) + # We do not support adding device nodes to the policy yet + devices_ok([], input.devices) ] count(possible_after_initial_containers) > 0 @@ -2124,6 +2134,12 @@ errors["capabilities don't match"] { count(possible_after_caps_containers) == 0 } +errors["devices not supported"] { + is_linux + input.rule == "create_container" + not devices_ok([], input.devices) +} + # covers exec_in_container as well. it shouldn't be possible to ever get # an exec_in_container as it "inherits" capabilities rules from create_container errors["containers only distinguishable by capabilties"] { diff --git a/pkg/securitypolicy/regopolicy_linux_test.go b/pkg/securitypolicy/regopolicy_linux_test.go index 51da87a18c..e35883edcb 100644 --- a/pkg/securitypolicy/regopolicy_linux_test.go +++ b/pkg/securitypolicy/regopolicy_linux_test.go @@ -2151,6 +2151,39 @@ func Test_Rego_EnforceCreateContainer_Capabilities_Drop_NoMatches(t *testing.T) } } +func Test_Rego_EnforceCreateContainer_RequireNoDevices(t *testing.T) { + f := func(p *generatedConstraints) bool { + tc, err := setupSimpleRegoCreateContainerTest(p) + if err != nil { + t.Error(err) + return false + } + + privileged := false + + _, _, _, err = tc.policy.EnforceCreateContainerPolicyV2(p.ctx, tc.containerID, tc.argList, tc.envList, tc.workingDir, tc.mounts, tc.user, &CreateContainerOptions{ + SandboxID: tc.sandboxID, + Privileged: &privileged, + NoNewPrivileges: &tc.noNewPrivileges, + Groups: tc.groups, + Umask: tc.umask, + Capabilities: tc.capabilities, + SeccompProfileSHA256: tc.seccomp, + LinuxDevices: []oci.LinuxDevice{ + { + Path: "/test", + }, + }, + }) + + return assertDecisionJSONContains(t, err, "devices not supported") + } + + if err := quick.Check(f, &quick.Config{MaxCount: 50, Rand: testRand}); err != nil { + t.Errorf("Test_Rego_EnforceCreateContainer_RequireNoDevices: %v", err) + } +} + func Test_Rego_ExtendDefaultMounts(t *testing.T) { f := func(p *generatedConstraints) bool { tc, err := setupSimpleRegoCreateContainerTest(p) diff --git a/pkg/securitypolicy/securitypolicyenforcer.go b/pkg/securitypolicy/securitypolicyenforcer.go index 0c2a98e998..4645220c95 100644 --- a/pkg/securitypolicy/securitypolicyenforcer.go +++ b/pkg/securitypolicy/securitypolicyenforcer.go @@ -32,6 +32,7 @@ type CreateContainerOptions struct { // IsSandboxContainer is true when the container being created is the cri // pod sandbox container (usually it is the "pause" image). IsSandboxContainer bool + LinuxDevices []oci.LinuxDevice } type SignalContainerOptions struct { IsInitProcess bool diff --git a/pkg/securitypolicy/securitypolicyenforcer_rego.go b/pkg/securitypolicy/securitypolicyenforcer_rego.go index 5e196ebd9a..54f68b9adf 100644 --- a/pkg/securitypolicy/securitypolicyenforcer_rego.go +++ b/pkg/securitypolicy/securitypolicyenforcer_rego.go @@ -714,6 +714,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicy( Capabilities: capabilities, SeccompProfileSHA256: seccompProfileSHA256, IsSandboxContainer: false, + LinuxDevices: []oci.LinuxDevice{}, } return policy.EnforceCreateContainerPolicyV2(ctx, containerID, argList, envList, workingDir, mounts, user, opts) } @@ -751,6 +752,7 @@ func (policy *regoEnforcer) EnforceCreateContainerPolicyV2( "sandboxDir": SandboxMountsDir(opts.SandboxID), "hugePagesDir": HugePagesMountsDir(opts.SandboxID), "mounts": appendMountData([]interface{}{}, mounts), + "devices": appendDeviceData([]interface{}{}, opts.LinuxDevices), "privileged": opts.Privileged, "noNewPrivileges": opts.NoNewPrivileges, "user": user.toInput(), @@ -840,6 +842,29 @@ func appendMountData(mountData []interface{}, mounts []oci.Mount) []interface{} return mountData } +func uint32ptrtoany(i *uint32) interface{} { + if i == nil { + return nil + } + return *i +} + +func appendDeviceData(deviceData []interface{}, devices []oci.LinuxDevice) []interface{} { + for _, device := range devices { + deviceData = append(deviceData, inputData{ + "path": device.Path, + "type": device.Type, + "major": device.Major, + "minor": device.Minor, + "fileMode": device.FileMode, + "uid": uint32ptrtoany(device.UID), + "gid": uint32ptrtoany(device.GID), + }) + } + + return deviceData +} + func (policy *regoEnforcer) ExtendDefaultMounts(mounts []oci.Mount) error { policy.defaultMounts = append(policy.defaultMounts, mounts...) defaultMounts := appendMountData([]interface{}{}, policy.defaultMounts)