chore: log provider stack traces on text file busy (#15249 )

re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by https://github.com/coder/terraform-provider-coder/pull/295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis <spike@coder.com> (cherry picked from commit d676ad56fe) Co-authored-by: Spike Curtis <spike@coder.com>
fix(site): sanitize login redirect (#15208 ) (#15220 )
2024-10-28 10:59:02 -05:00 · 2024-10-24 20:49:03 +01:00 · 2024-10-15 16:20:22 -05:00 · 2024-10-08 12:52:06 -05:00 · 2024-10-08 12:02:05 -05:00 · 2024-10-05 21:54:01 +05:00
23 changed files with 536 additions and 104 deletions
@@ -1669,13 +1669,12 @@ func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebounc
 		}

 		score, niceErr := proc.Niceness(a.syscaller)
-		if niceErr != nil && !xerrors.Is(niceErr, os.ErrPermission) {
+		if niceErr != nil && !isBenignProcessErr(niceErr) {
 			debouncer.Warn(ctx, "unable to get proc niceness",
 				slog.F("cmd", proc.Cmd()),
 				slog.F("pid", proc.PID),
 				slog.Error(niceErr),
 			)
-			continue
 		}

 		// We only want processes that don't have a nice value set
@@ -1689,7 +1688,7 @@ func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebounc

 		if niceErr == nil {
 			err := proc.SetNiceness(a.syscaller, niceness)
-			if err != nil && !xerrors.Is(err, os.ErrPermission) {
+			if err != nil && !isBenignProcessErr(err) {
 				debouncer.Warn(ctx, "unable to set proc niceness",
 					slog.F("cmd", proc.Cmd()),
 					slog.F("pid", proc.PID),
@@ -1703,7 +1702,7 @@ func (a *agent) manageProcessPriority(ctx context.Context, debouncer *logDebounc
 		if oomScore != unsetOOMScore && oomScore != proc.OOMScoreAdj && !isCustomOOMScore(agentScore, proc) {
 			oomScoreStr := strconv.Itoa(oomScore)
 			err := afero.WriteFile(a.filesystem, fmt.Sprintf("/proc/%d/oom_score_adj", proc.PID), []byte(oomScoreStr), 0o644)
-			if err != nil && !xerrors.Is(err, os.ErrPermission) {
+			if err != nil && !isBenignProcessErr(err) {
 				debouncer.Warn(ctx, "unable to set oom_score_adj",
 					slog.F("cmd", proc.Cmd()),
 					slog.F("pid", proc.PID),
@@ -2139,3 +2138,14 @@ func (l *logDebouncer) log(ctx context.Context, level slog.Level, msg string, fi
 	}
 	l.messages[msg] = time.Now()
 }
+
+// isBenignProcessErr reports whether err is a failure we expect to see while
+// adjusting another process and can safely ignore: it matches os.ErrNotExist
+// or os.ErrPermission, or isNoSuchProcessErr accepts it. A nil err is never
+// benign.
+func isBenignProcessErr(err error) bool {
+	switch {
+	case err == nil:
+		return false
+	case xerrors.Is(err, os.ErrNotExist), xerrors.Is(err, os.ErrPermission):
+		return true
+	default:
+		return isNoSuchProcessErr(err)
+	}
+}
+
+// isNoSuchProcessErr reports whether err is non-nil and its message contains
+// the text "no such process".
+func isNoSuchProcessErr(err error) bool {
+	if err == nil {
+		return false
+	}
+	msg := err.Error()
+	return strings.Contains(msg, "no such process")
+}
@@ -45,8 +45,7 @@ func List(fs afero.Fs, syscaller Syscaller) ([]*Process, error) {

 		cmdline, err := afero.ReadFile(fs, filepath.Join(defaultProcDir, entry, "cmdline"))
 		if err != nil {
-			var errNo syscall.Errno
-			if xerrors.As(err, &errNo) && errNo == syscall.EPERM {
+			if isBenignError(err) {
 				continue
 			}
 			return nil, xerrors.Errorf("read cmdline: %w", err)
@@ -54,7 +53,7 @@ func List(fs afero.Fs, syscaller Syscaller) ([]*Process, error) {

 		oomScore, err := afero.ReadFile(fs, filepath.Join(defaultProcDir, entry, "oom_score_adj"))
 		if err != nil {
-			if xerrors.Is(err, os.ErrPermission) {
+			if isBenignError(err) {
 				continue
 			}

@@ -124,3 +123,12 @@ func (p *Process) Cmd() string {
 func (p *Process) cmdLine() []string {
 	return strings.Split(p.CmdLine, "\x00")
 }
+
+// isBenignError reports whether err only tells us that a process went away or
+// is off-limits while we were reading its /proc entries, in which case the
+// caller skips that process instead of failing the whole listing.
+//
+// The os.ErrNotExist / os.ErrPermission checks run first and do not require
+// the chain to contain a syscall.Errno, so errors from filesystems that return
+// the os sentinels directly are recognized too. syscall.Errno itself matches
+// those sentinels for ENOENT, EPERM and EACCES.
+func isBenignError(err error) bool {
+	if err == nil {
+		return false
+	}
+	if xerrors.Is(err, os.ErrNotExist) || xerrors.Is(err, os.ErrPermission) {
+		return true
+	}
+
+	// ESRCH has no os sentinel, so it needs the raw errno. EPERM is already
+	// covered above and is kept here only to mirror the explicit list.
+	var errno syscall.Errno
+	if !xerrors.As(err, &errno) {
+		return false
+	}
+	return errno == syscall.ESRCH || errno == syscall.EPERM
+}
@@ -7,7 +7,6 @@ import (
 	"github.com/google/uuid"
 	"github.com/stretchr/testify/require"

-	"github.com/coder/coder/v2/coderd/coderdtest"
 	"github.com/coder/coder/v2/codersdk"
 )

@@ -109,9 +108,9 @@ func TestCreateUserRequestJSON(t *testing.T) {
 		t.Parallel()

 		req := codersdk.CreateUserRequestWithOrgs{
-			Email:           coderdtest.RandomName(t),
-			Username:        coderdtest.RandomName(t),
-			Name:            coderdtest.RandomName(t),
+			Email:           "alice@coder.com",
+			Username:        "alice",
+			Name:            "Alice User",
 			Password:        "",
 			UserLoginType:   codersdk.LoginTypePassword,
 			OrganizationIDs: []uuid.UUID{uuid.New(), uuid.New()},
@@ -123,9 +122,9 @@ func TestCreateUserRequestJSON(t *testing.T) {
 		t.Parallel()

 		req := codersdk.CreateUserRequestWithOrgs{
-			Email:           coderdtest.RandomName(t),
-			Username:        coderdtest.RandomName(t),
-			Name:            coderdtest.RandomName(t),
+			Email:           "alice@coder.com",
+			Username:        "alice",
+			Name:            "Alice User",
 			Password:        "",
 			UserLoginType:   codersdk.LoginTypePassword,
 			OrganizationIDs: []uuid.UUID{uuid.New()},
@@ -137,9 +136,9 @@ func TestCreateUserRequestJSON(t *testing.T) {
 		t.Parallel()

 		req := codersdk.CreateUserRequestWithOrgs{
-			Email:           coderdtest.RandomName(t),
-			Username:        coderdtest.RandomName(t),
-			Name:            coderdtest.RandomName(t),
+			Email:           "alice@coder.com",
+			Username:        "alice",
+			Name:            "Alice User",
 			Password:        "",
 			UserLoginType:   codersdk.LoginTypePassword,
 			OrganizationIDs: []uuid.UUID{},
@@ -77,9 +77,9 @@ can only be delivered to one method, and this method is configured globally with
 [`CODER_NOTIFICATIONS_METHOD`](https://coder.com/docs/reference/cli/server#--notifications-method)
 (default: `smtp`).

-Enterprise customers can configured which method to use for each of the
-supported [Events](#events); see the [Preferences](#preferences) section below
-for more details.
+Enterprise customers can configure which method to use for each of the supported
+[Events](#events); see the [Preferences](#preferences) section below for more
+details.

 ## SMTP (Email)

@@ -93,7 +93,7 @@ existing one.
 | :------: | --------------------------------- | ------------------------------------- | ----------- | ----------------------------------------- | ------------- |
 |    ✔️    | `--notifications-email-from`      | `CODER_NOTIFICATIONS_EMAIL_FROM`      | `string`    | The sender's address to use.              |               |
 |    ✔️    | `--notifications-email-smarthost` | `CODER_NOTIFICATIONS_EMAIL_SMARTHOST` | `host:port` | The SMTP relay to send messages through.  | localhost:587 |
-|    -️    | `--notifications-email-hello`     | `CODER_NOTIFICATIONS_EMAIL_HELLO`     | `string`    | The hostname identifying the SMTP server. | localhost     |
+|    ✔️    | `--notifications-email-hello`     | `CODER_NOTIFICATIONS_EMAIL_HELLO`     | `string`    | The hostname identifying the SMTP server. | localhost     |

 **Authentication Settings:**

@@ -252,6 +252,18 @@ To pause sending notifications, execute
 To resume sending notifications, execute
 [`coder notifications resume`](https://coder.com/docs/reference/cli/notifications_resume).

+## Troubleshooting
+
+If notifications are not being delivered, use the following methods to
+troubleshoot:
+
+1. Ensure notifications are being added to the `notification_messages` table
+2. Review any error messages in the `status_reason` column, should an error have
+   occurred
+3. Review the logs (search for the term `notifications`) for diagnostic
+   information<br> _If you do not see any relevant logs, set
+   `CODER_VERBOSE=true` or `--verbose` to output debug logs_
+
 ## Internals

 The notification system is built to operate concurrently in a single- or
@@ -288,5 +300,4 @@ messages._
  - after `CODER_NOTIFICATIONS_MAX_SEND_ATTEMPTS` is exceeded, it transitions to
    `permanent_failure`

-Diagnostic messages will be saved in the `notification_messages` table and will
-be logged, in the case of failure.
+See [Troubleshooting](#troubleshooting) above for more details.
@@ -174,6 +174,10 @@ type LicenseOptions struct {
 	// ExpiresAt is the time at which the license will hard expire.
 	// ExpiresAt should always be greater than GraceAt.
 	ExpiresAt time.Time
+	// NotBefore is the time at which the license becomes valid. If set to the
+	// zero value, the `nbf` claim on the license is set to 1 minute in the
+	// past.
+	NotBefore time.Time
 	Features  license.Features
 }

@@ -195,6 +199,13 @@ func (opts *LicenseOptions) Valid(now time.Time) *LicenseOptions {
 	return opts
 }

+// FutureTerm configures the license for a term that has not started yet,
+// measured from now: NotBefore is 24 hours ahead, GraceAt is 53 days ahead and
+// ExpiresAt is 60 days ahead. It modifies opts in place and returns it so calls
+// can be chained.
+func (opts *LicenseOptions) FutureTerm(now time.Time) *LicenseOptions {
+	const day = 24 * time.Hour
+
+	opts.NotBefore = now.Add(day)
+	opts.GraceAt = now.Add(53 * day)
+	opts.ExpiresAt = now.Add(60 * day)
+	return opts
+}
+
 func (opts *LicenseOptions) UserLimit(limit int64) *LicenseOptions {
 	return opts.Feature(codersdk.FeatureUserLimit, limit)
 }
@@ -233,13 +244,16 @@ func GenerateLicense(t *testing.T, options LicenseOptions) string {
 	if options.GraceAt.IsZero() {
 		options.GraceAt = time.Now().Add(time.Hour)
 	}
+	if options.NotBefore.IsZero() {
+		options.NotBefore = time.Now().Add(-time.Minute)
+	}

 	c := &license.Claims{
 		RegisteredClaims: jwt.RegisteredClaims{
 			ID:        uuid.NewString(),
 			Issuer:    "test@testing.test",
 			ExpiresAt: jwt.NewNumericDate(options.ExpiresAt),
-			NotBefore: jwt.NewNumericDate(time.Now().Add(-time.Minute)),
+			NotBefore: jwt.NewNumericDate(options.NotBefore),
 			IssuedAt:  jwt.NewNumericDate(time.Now().Add(-time.Minute)),
 		},
 		LicenseExpires: jwt.NewNumericDate(options.GraceAt),
@@ -100,6 +100,13 @@ func LicensesEntitlements(
 	//   'Entitlements' group as a whole.
 	for _, license := range licenses {
 		claims, err := ParseClaims(license.JWT, keys)
+		var vErr *jwt.ValidationError
+		if xerrors.As(err, &vErr) && vErr.Is(jwt.ErrTokenNotValidYet) {
+			// The license isn't valid yet.  We don't consider any entitlements contained in it, but
+			// it's also not an error.  Just skip it silently.  This can happen if an administrator
+			// uploads a license for a new term that hasn't started yet.
+			continue
+		}
 		if err != nil {
 			entitlements.Errors = append(entitlements.Errors,
 				fmt.Sprintf("Invalid license (%s) parsing claims: %s", license.UUID.String(), err.Error()))
@@ -287,6 +294,8 @@ var (
 	ErrInvalidVersion        = xerrors.New("license must be version 3")
 	ErrMissingKeyID          = xerrors.Errorf("JOSE header must contain %s", HeaderKeyID)
 	ErrMissingLicenseExpires = xerrors.New("license missing license_expires")
+	ErrMissingExp            = xerrors.New("exp claim missing or not parsable")
+	ErrMultipleIssues        = xerrors.New("license has multiple issues; contact support")
 )

 type Features map[codersdk.FeatureName]int64
@@ -336,7 +345,7 @@ func ParseRaw(l string, keys map[string]ed25519.PublicKey) (jwt.MapClaims, error
 	return nil, xerrors.New("unable to parse Claims")
 }

-// ParseClaims validates a database.License record, and if valid, returns the claims.  If
+// ParseClaims validates a raw JWT, and if valid, returns the claims.  If
 // unparsable or invalid, it returns an error
 func ParseClaims(rawJWT string, keys map[string]ed25519.PublicKey) (*Claims, error) {
 	tok, err := jwt.ParseWithClaims(
@@ -348,18 +357,53 @@ func ParseClaims(rawJWT string, keys map[string]ed25519.PublicKey) (*Claims, err
 	if err != nil {
 		return nil, err
 	}
-	if claims, ok := tok.Claims.(*Claims); ok && tok.Valid {
+	return validateClaims(tok)
+}
+
+// validateClaims applies the license-specific checks to an already parsed
+// token and returns its claims. The token's claims must be of type *Claims,
+// Version must equal CurrentVersion, and both LicenseExpires and ExpiresAt
+// must be present; each failure maps to its own error. It does not look at
+// tok.Valid, so callers decide which JWT validation errors are acceptable
+// before calling it.
+func validateClaims(tok *jwt.Token) (*Claims, error) {
+	if claims, ok := tok.Claims.(*Claims); ok {
 		if claims.Version != uint64(CurrentVersion) {
 			return nil, ErrInvalidVersion
 		}
 		if claims.LicenseExpires == nil {
 			return nil, ErrMissingLicenseExpires
 		}
+		if claims.ExpiresAt == nil {
+			return nil, ErrMissingExp
+		}
 		return claims, nil
 	}
 	return nil, xerrors.New("unable to parse Claims")
 }

+// ParseClaimsIgnoreNbf parses and validates a raw JWT but tolerates a failing
+// `nbf` (not before) claim, which makes it suitable for answering "will this
+// license be valid now or at some point in the future?". Any JWT validation
+// failure other than "not valid yet" yields ErrMultipleIssues; errors that are
+// not JWT validation errors are returned unchanged. On success the claims are
+// returned after the usual validateClaims checks.
+func ParseClaimsIgnoreNbf(rawJWT string, keys map[string]ed25519.PublicKey) (*Claims, error) {
+	tok, err := jwt.ParseWithClaims(rawJWT, &Claims{}, keyFunc(keys), jwt.WithValidMethods(ValidMethods))
+	if err == nil {
+		return validateClaims(tok)
+	}
+
+	var vErr *jwt.ValidationError
+	if !xerrors.As(err, &vErr) {
+		return nil, err
+	}
+
+	// Clear the NotValidYet bit so that only the other problems remain.
+	vErr.Errors &^= jwt.ValidationErrorNotValidYet
+	if vErr.Errors != 0 {
+		// There are other errors besides not being valid yet. We _could_ go
+		// through all the jwt.ValidationError bits and try to work out the
+		// correct error, but if we get here something very strange is going
+		// on, so just return a generic error that says to get in touch with
+		// our support team.
+		return nil, ErrMultipleIssues
+	}
+	return validateClaims(tok)
+}
+
 func keyFunc(keys map[string]ed25519.PublicKey) func(*jwt.Token) (interface{}, error) {
 	return func(j *jwt.Token) (interface{}, error) {
 		keyID, ok := j.Header[HeaderKeyID].(string)
@@ -826,6 +826,25 @@ func TestLicenseEntitlements(t *testing.T) {
 				assert.True(t, entitlements.Features[codersdk.FeatureCustomRoles].Enabled, "custom-roles enabled for premium")
 			},
 		},
+		{
+			Name: "CurrentAndFuture",
+			Licenses: []*coderdenttest.LicenseOptions{
+				enterpriseLicense().UserLimit(100),
+				premiumLicense().UserLimit(200).FutureTerm(time.Now()),
+			},
+			Enablements: defaultEnablements,
+			AssertEntitlements: func(t *testing.T, entitlements codersdk.Entitlements) {
+				assertEnterpriseFeatures(t, entitlements)
+				assertNoErrors(t, entitlements)
+				assertNoWarnings(t, entitlements)
+				userFeature := entitlements.Features[codersdk.FeatureUserLimit]
+				assert.Equalf(t, int64(100), *userFeature.Limit, "user limit")
+				assert.Equal(t, codersdk.EntitlementNotEntitled,
+					entitlements.Features[codersdk.FeatureMultipleOrganizations].Entitlement)
+				assert.Equal(t, codersdk.EntitlementNotEntitled,
+					entitlements.Features[codersdk.FeatureCustomRoles].Entitlement)
+			},
+		},
 	}

 	for _, tc := range testCases {
@@ -86,25 +86,7 @@ func (api *API) postLicense(rw http.ResponseWriter, r *http.Request) {
 		return
 	}

-	rawClaims, err := license.ParseRaw(addLicense.License, api.LicenseKeys)
-	if err != nil {
-		httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
-			Message: "Invalid license",
-			Detail:  err.Error(),
-		})
-		return
-	}
-	exp, ok := rawClaims["exp"].(float64)
-	if !ok {
-		httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
-			Message: "Invalid license",
-			Detail:  "exp claim missing or not parsable",
-		})
-		return
-	}
-	expTime := time.Unix(int64(exp), 0)
-
-	claims, err := license.ParseClaims(addLicense.License, api.LicenseKeys)
+	claims, err := license.ParseClaimsIgnoreNbf(addLicense.License, api.LicenseKeys)
 	if err != nil {
 		httpapi.Write(ctx, rw, http.StatusBadRequest, codersdk.Response{
 			Message: "Invalid license",
@@ -134,7 +116,7 @@ func (api *API) postLicense(rw http.ResponseWriter, r *http.Request) {
 	dl, err := api.Database.InsertLicense(ctx, database.InsertLicenseParams{
 		UploadedAt: dbtime.Now(),
 		JWT:        addLicense.License,
-		Exp:        expTime,
+		Exp:        claims.ExpiresAt.Time,
 		UUID:       id,
 	})
 	if err != nil {
@@ -160,7 +142,15 @@ func (api *API) postLicense(rw http.ResponseWriter, r *http.Request) {
 		// don't fail the HTTP request, since we did write it successfully to the database
 	}

-	httpapi.Write(ctx, rw, http.StatusCreated, convertLicense(dl, rawClaims))
+	c, err := decodeClaims(dl)
+	if err != nil {
+		httpapi.Write(ctx, rw, http.StatusInternalServerError, codersdk.Response{
+			Message: "Failed to decode database response",
+			Detail:  err.Error(),
+		})
+		return
+	}
+	httpapi.Write(ctx, rw, http.StatusCreated, convertLicense(dl, c))
 }

 // postRefreshEntitlements forces an `updateEntitlements` call and publishes
@@ -4,6 +4,7 @@ import (
 	"context"
 	"net/http"
 	"testing"
+	"time"

 	"github.com/google/uuid"
 	"github.com/stretchr/testify/assert"
@@ -82,6 +83,53 @@ func TestPostLicense(t *testing.T) {
 			t.Error("expected to get error status 400")
 		}
 	})
+
+	// Test a license that isn't yet valid, but will be in the future.  We should allow this so that
+	// operators can upload a license ahead of time.
+	t.Run("NotYet", func(t *testing.T) {
+		t.Parallel()
+		client, _ := coderdenttest.New(t, &coderdenttest.Options{DontAddLicense: true})
+		respLic := coderdenttest.AddLicense(t, client, coderdenttest.LicenseOptions{
+			AccountType: license.AccountTypeSalesforce,
+			AccountID:   "testing",
+			Features: license.Features{
+				codersdk.FeatureAuditLog: 1,
+			},
+			NotBefore: time.Now().Add(time.Hour),
+			GraceAt:   time.Now().Add(2 * time.Hour),
+			ExpiresAt: time.Now().Add(3 * time.Hour),
+		})
+		assert.GreaterOrEqual(t, respLic.ID, int32(0))
+		// just a couple spot checks for sanity
+		assert.Equal(t, "testing", respLic.Claims["account_id"])
+		features, err := respLic.FeaturesClaims()
+		require.NoError(t, err)
+		assert.EqualValues(t, 1, features[codersdk.FeatureAuditLog])
+	})
+
+	// Test we still reject a license that isn't valid yet, but has other issues (e.g. expired
+	// before it starts).
+	t.Run("NotEver", func(t *testing.T) {
+		t.Parallel()
+		client, _ := coderdenttest.New(t, &coderdenttest.Options{DontAddLicense: true})
+		lic := coderdenttest.GenerateLicense(t, coderdenttest.LicenseOptions{
+			AccountType: license.AccountTypeSalesforce,
+			AccountID:   "testing",
+			Features: license.Features{
+				codersdk.FeatureAuditLog: 1,
+			},
+			NotBefore: time.Now().Add(time.Hour),
+			GraceAt:   time.Now().Add(2 * time.Hour),
+			ExpiresAt: time.Now().Add(-time.Hour),
+		})
+		_, err := client.AddLicense(context.Background(), codersdk.AddLicenseRequest{
+			License: lic,
+		})
+		errResp := &codersdk.Error{}
+		require.ErrorAs(t, err, &errResp)
+		require.Equal(t, http.StatusBadRequest, errResp.StatusCode())
+		require.Contains(t, errResp.Detail, license.ErrMultipleIssues.Error())
+	})
 }

 func TestGetLicense(t *testing.T) {
@@ -200,6 +200,15 @@ func versionFromBinaryPath(ctx context.Context, binaryPath string) (*version.Ver
 	return version.NewVersion(vj.Version)
 }

+// textFileBusyError is the error init returns when the terraform command
+// exits with an error and its stderr output contains "text file busy".
+type textFileBusyError struct {
+	// exitErr is the exit error reported for the command.
+	exitErr *exec.ExitError
+	// stderr is the captured stderr output of the command.
+	stderr string
+}
+
+// Error implements the error interface.
+func (e *textFileBusyError) Error() string {
+	const prefix = "text file busy: "
+	return prefix + e.exitErr.String()
+}
+
 func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
 	ctx, span := e.server.startTrace(ctx, tracing.FuncName())
 	defer span.End()
@@ -216,13 +225,24 @@ func (e *executor) init(ctx, killCtx context.Context, logr logSink) error {
 		<-doneErr
 	}()

+	// As a special case, we want to look for the error "text file busy" in the stderr output of
+	// the init command, so we also take a copy of the stderr into an in memory buffer.
+	errBuf := newBufferedWriteCloser(errWriter)
+
 	args := []string{
 		"init",
 		"-no-color",
 		"-input=false",
 	}

-	return e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errWriter)
+	err := e.execWriteOutput(ctx, killCtx, args, e.basicEnv(), outWriter, errBuf)
+	var exitErr *exec.ExitError
+	if xerrors.As(err, &exitErr) {
+		if bytes.Contains(errBuf.b.Bytes(), []byte("text file busy")) {
+			return &textFileBusyError{exitErr: exitErr, stderr: errBuf.b.String()}
+		}
+	}
+	return err
 }

 func getPlanFilePath(workdir string) string {
@@ -707,3 +727,26 @@ func (sw syncWriter) Write(p []byte) (n int, err error) {
 	defer sw.mut.Unlock()
 	return sw.w.Write(p)
 }
+
+// bufferedWriteCloser forwards writes to an underlying io.WriteCloser while
+// also keeping a copy of everything written in an in-memory buffer.
+type bufferedWriteCloser struct {
+	// wc receives every write and is closed by Close.
+	wc io.WriteCloser
+	// b accumulates a copy of all bytes passed to Write.
+	b bytes.Buffer
+}
+
+// newBufferedWriteCloser wraps wc; the copy buffer starts out empty.
+func newBufferedWriteCloser(wc io.WriteCloser) *bufferedWriteCloser {
+	bwc := new(bufferedWriteCloser)
+	bwc.wc = wc
+	return bwc
+}
+
+// Write records p in the in-memory buffer first and then passes it on to the
+// wrapped writer, returning the wrapped writer's result.
+func (b *bufferedWriteCloser) Write(p []byte) (int, error) {
+	if n, err := b.b.Write(p); err != nil {
+		return n, err
+	}
+	return b.wc.Write(p)
+}
+
+// Close closes the wrapped writer. The buffered copy remains readable.
+func (b *bufferedWriteCloser) Close() error {
+	return b.wc.Close()
+}
@@ -4,7 +4,11 @@ import (
 	"context"
 	"encoding/json"
 	"fmt"
+	"io"
+	"net"
+	"net/http"
 	"os"
+	"path/filepath"
 	"strings"
 	"time"

@@ -109,10 +113,32 @@ func (s *server) Plan(
 	initTimings.ingest(createInitTimingsEvent(timingInitStart))

 	err = e.init(ctx, killCtx, sess)
+
 	if err != nil {
 		initTimings.ingest(createInitTimingsEvent(timingInitErrored))

 		s.logger.Debug(ctx, "init failed", slog.Error(err))
+
+		// Special handling for "text file busy" c.f. https://github.com/coder/coder/issues/14726
+		// We believe this might be due to some race condition that prevents the
+		// terraform-provider-coder process from exiting.  When terraform tries to install the
+		// provider during this init, it copies over the local cache. Normally this isn't an issue,
+		// but if the terraform-provider-coder process is still running from a previous build, Linux
+		// returns "text file busy" error when attempting to open the file.
+		//
+		// Capturing the stack trace from the process should help us figure out why it has not
+		// exited.  We'll drop these diagnostics in a CRITICAL log so that operators are likely to
+		// notice, and also because it indicates this provisioner could be permanently broken and
+		// require a restart.
+		var errTFB *textFileBusyError
+		if xerrors.As(err, &errTFB) {
+			stacktrace := tryGettingCoderProviderStacktrace(sess)
+			s.logger.Critical(ctx, "init: text file busy",
+				slog.Error(errTFB),
+				slog.F("stderr", errTFB.stderr),
+				slog.F("provider_coder_stacktrace", stacktrace),
+			)
+		}
 		return provisionersdk.PlanErrorf("initialize terraform: %s", err)
 	}

@@ -280,3 +306,39 @@ func logTerraformEnvVars(sink logSink) {
 		}
 	}
 }
+
+// tryGettingCoderProviderStacktrace attempts to dial a special pprof endpoint we added to
+// terraform-provider-coder in https://github.com/coder/terraform-provider-coder/pull/295 which
+// shipped in v1.0.4.  It will return the stacktraces of the provider, which will hopefully allow us
+// to figure out why it hasn't exited.
+//
+// The endpoint is a unix socket at "../.coder/pprof" relative to the session work directory. This
+// is best effort: failures are logged and "" is returned; if reading the response body fails
+// part-way, whatever was read so far is still returned.
+func tryGettingCoderProviderStacktrace(sess *provisionersdk.Session) string {
+	path := filepath.Clean(filepath.Join(sess.WorkDirectory, "../.coder/pprof"))
+	sess.Logger.Info(sess.Context(), "attempting to get stack traces", slog.F("path", path))
+	c := http.Client{
+		Transport: &http.Transport{
+			// This client is used for a single request and then dropped. Without this, the
+			// transport would keep the socket connection (and the goroutines serving it) in an
+			// idle pool that nothing ever reuses or closes.
+			DisableKeepAlives: true,
+			// Ignore the network/address derived from the URL and always dial the unix socket.
+			DialContext: func(ctx context.Context, _, _ string) (net.Conn, error) {
+				d := net.Dialer{}
+				return d.DialContext(ctx, "unix", path)
+			},
+		},
+	}
+	req, err := http.NewRequestWithContext(sess.Context(), http.MethodGet,
+		"http://localhost/debug/pprof/goroutine?debug=2", nil)
+	if err != nil {
+		sess.Logger.Error(sess.Context(), "error creating GET request", slog.Error(err))
+		return ""
+	}
+	resp, err := c.Do(req)
+	if err != nil {
+		// Only log at Info here, since we only added the pprof endpoint to terraform-provider-coder
+		// in v1.0.4
+		sess.Logger.Info(sess.Context(), "could not GET stack traces", slog.Error(err))
+		return ""
+	}
+	defer resp.Body.Close()
+	stacktraces, err := io.ReadAll(resp.Body)
+	if err != nil {
+		// Deliberately fall through: a partial stack dump is still useful for diagnosis.
+		sess.Logger.Error(sess.Context(), "could not read stack traces", slog.Error(err))
+	}
+	return string(stacktraces)
+}
@@ -9,6 +9,8 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"net"
+	"net/http"
 	"os"
 	"path/filepath"
 	"runtime"
@@ -31,6 +33,8 @@ import (
 type provisionerServeOptions struct {
 	binaryPath  string
 	exitTimeout time.Duration
+	workDir     string
+	logger      *slog.Logger
 }

 func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Context, proto.DRPCProvisionerClient) {
@@ -38,7 +42,13 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
 		opts = &provisionerServeOptions{}
 	}
 	cachePath := t.TempDir()
-	workDir := t.TempDir()
+	if opts.workDir == "" {
+		opts.workDir = t.TempDir()
+	}
+	if opts.logger == nil {
+		logger := slogtest.Make(t, nil).Leveled(slog.LevelDebug)
+		opts.logger = &logger
+	}
 	client, server := drpc.MemTransportPipe()
 	ctx, cancelFunc := context.WithCancel(context.Background())
 	serverErr := make(chan error, 1)
@@ -55,8 +65,8 @@ func setupProvisioner(t *testing.T, opts *provisionerServeOptions) (context.Cont
 		serverErr <- terraform.Serve(ctx, &terraform.ServeOptions{
 			ServeOptions: &provisionersdk.ServeOptions{
 				Listener:      server,
-				Logger:        slogtest.Make(t, nil).Leveled(slog.LevelDebug),
-				WorkDirectory: workDir,
+				Logger:        *opts.logger,
+				WorkDirectory: opts.workDir,
 			},
 			BinaryPath:  opts.binaryPath,
 			CachePath:   cachePath,
@@ -236,7 +246,7 @@ func TestProvision_CancelTimeout(t *testing.T) {
 	dir := t.TempDir()
 	binPath := filepath.Join(dir, "terraform")

-	// Example: exec /path/to/terrafork_fake_cancel.sh 1.2.1 apply "$@"
+	// Example: exec /path/to/terraform_fake_cancel.sh 1.2.1 apply "$@"
 	content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
 	err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
 	require.NoError(t, err)
@@ -282,6 +292,81 @@ func TestProvision_CancelTimeout(t *testing.T) {
 	}
 }

+// TestProvision_TextFileBusy checks that a "text file busy" failure during terraform init makes
+// the provisioner query the pprof unix socket exactly once and still report the init failure.
+//
+// below we exec fake_text_file_busy.sh, which causes the kernel to execute it, and if more than
+// one process tries to do this, it can cause "text file busy" to be returned to us. In this test
+// we want to simulate "text file busy" getting logged by terraform, due to an issue with the
+// terraform-provider-coder
+// nolint: paralleltest
+func TestProvision_TextFileBusy(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("This test uses unix sockets and is not supported on Windows")
+	}
+
+	cwd, err := os.Getwd()
+	require.NoError(t, err)
+	fakeBin := filepath.Join(cwd, "testdata", "fake_text_file_busy.sh")
+
+	dir := t.TempDir()
+	binPath := filepath.Join(dir, "terraform")
+
+	// Example: exec /path/to/fake_text_file_busy.sh 1.2.1 "$@"
+	content := fmt.Sprintf("#!/bin/sh\nexec %q %s \"$@\"\n", fakeBin, terraform.TerraformVersion.String())
+	err = os.WriteFile(binPath, []byte(content), 0o755) //#nosec
+	require.NoError(t, err)
+
+	workDir := t.TempDir()
+
+	// Serve a stand-in for the provider's pprof endpoint on a unix socket at
+	// <workDir>/.coder/pprof.
+	// NOTE(review): presumably the session work directory is a direct child of workDir, so that
+	// "../.coder/pprof" resolves to this socket -- confirm against provisionersdk.
+	err = os.Mkdir(filepath.Join(workDir, ".coder"), 0o700)
+	require.NoError(t, err)
+	l, err := net.Listen("unix", filepath.Join(workDir, ".coder", "pprof"))
+	require.NoError(t, err)
+	defer l.Close()
+	// Counts requests received by the fake pprof server.
+	handlerCalled := 0
+	// nolint: gosec
+	srv := &http.Server{
+		Handler: http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			assert.Equal(t, "/debug/pprof/goroutine", r.URL.Path)
+			w.WriteHeader(http.StatusOK)
+			_, err := w.Write([]byte("thestacks\n"))
+			assert.NoError(t, err)
+			handlerCalled++
+		}),
+	}
+	srvErr := make(chan error, 1)
+	go func() {
+		srvErr <- srv.Serve(l)
+	}()
+
+	// IgnoreErrors: presumably needed because the code under test logs the failure at a level
+	// that slogtest would otherwise turn into a test failure.
+	logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true})
+	ctx, api := setupProvisioner(t, &provisionerServeOptions{
+		binaryPath:  binPath,
+		exitTimeout: time.Second,
+		workDir:     workDir,
+		logger:      &logger,
+	})
+
+	sess := configure(ctx, t, api, &proto.Config{
+		TemplateSourceArchive: makeTar(t, nil),
+	})
+
+	err = sendPlan(sess, proto.WorkspaceTransition_START)
+	require.NoError(t, err)
+
+	// Drain messages until the plan response arrives; it must carry the init failure.
+	found := false
+	for {
+		msg, err := sess.Recv()
+		require.NoError(t, err)
+
+		if c := msg.GetPlan(); c != nil {
+			require.Contains(t, c.Error, "exit status 1")
+			found = true
+			break
+		}
+	}
+	require.True(t, found)
+	require.EqualValues(t, 1, handlerCalled)
+}
+
 func TestProvision(t *testing.T) {
 	t.Parallel()

@@ -0,0 +1,40 @@
+#!/bin/sh
+
+# Fake terraform binary used by tests. The first argument is the version string
+# to report; the remaining arguments are the terraform subcommand and its flags.
+#   version     prints version JSON on stdout and exits 0
+#   init        prints a "text file busy" provider install failure on stderr
+#               and exits 1
+#   plan/apply  not supported, exit 1
+# Any other subcommand exits 10.
+
+VERSION=$1
+shift 1
+
+# Emits its arguments as a JSON error log line. Not called by any branch below.
+json_print() {
+	echo "{\"@level\":\"error\",\"@message\":\"$*\"}"
+}
+
+case "$1" in
+version)
+	cat <<-EOF
+		{
+			"terraform_version": "${VERSION}",
+			"platform": "linux_amd64",
+			"provider_selections": {},
+			"terraform_outdated": false
+		}
+	EOF
+	exit 0
+	;;
+init)
+	# Mimic terraform failing to install the provider because its binary is busy.
+	echo "init"
+	echo >&2 "Error: Failed to install provider"
+	echo >&2 "    Error while installing coder/coder v1.0.4: open"
+	echo >&2 "    /home/coder/.cache/coder/provisioner-0/tf/registry.terraform.io/coder/coder/1.0.3/linux_amd64/terraform-provider-coder_v1.0.4:"
+	echo >&2 "    text file busy"
+	exit 1
+	;;
+plan)
+	echo "plan not supported"
+	exit 1
+	;;
+apply)
+	echo "apply not supported"
+	exit 1
+	;;
+esac
+
+# Unknown subcommand.
+exit 10
@@ -28,7 +28,7 @@ describe("useAgentLogs", () => {
 		expect(wsSpy).not.toHaveBeenCalled();
 	});

-	it("should return existing logs without network calls", async () => {
+	it("should return existing logs without network calls if state is off", async () => {
 		const queryClient = createTestQueryClient();
 		queryClient.setQueryData(
 			agentLogsKey(MockWorkspace.id, MockWorkspaceAgent.id),
@@ -39,7 +39,7 @@ describe("useAgentLogs", () => {
 		const { result } = renderUseAgentLogs(queryClient, {
 			workspaceId: MockWorkspace.id,
 			agentId: MockWorkspaceAgent.id,
-			agentLifeCycleState: "ready",
+			agentLifeCycleState: "off",
 		});
 		await waitFor(() => {
 			expect(result.current).toHaveLength(5);
@@ -48,12 +48,12 @@ describe("useAgentLogs", () => {
 		expect(wsSpy).not.toHaveBeenCalled();
 	});

-	it("should fetch logs when empty and should not connect to WebSocket when not starting", async () => {
+	it("should fetch logs when empty", async () => {
 		const queryClient = createTestQueryClient();
 		const fetchSpy = jest
 			.spyOn(API, "getWorkspaceAgentLogs")
 			.mockResolvedValueOnce(generateLogs(5));
-		const wsSpy = jest.spyOn(APIModule, "watchWorkspaceAgentLogs");
+		jest.spyOn(APIModule, "watchWorkspaceAgentLogs");
 		const { result } = renderUseAgentLogs(queryClient, {
 			workspaceId: MockWorkspace.id,
 			agentId: MockWorkspaceAgent.id,
@@ -63,10 +63,9 @@ describe("useAgentLogs", () => {
 			expect(result.current).toHaveLength(5);
 		});
 		expect(fetchSpy).toHaveBeenCalledWith(MockWorkspaceAgent.id);
-		expect(wsSpy).not.toHaveBeenCalled();
 	});

-	it("should fetch logs and connect to websocket when agent is starting", async () => {
+	it("should fetch logs and connect to websocket", async () => {
 		const queryClient = createTestQueryClient();
 		const logs = generateLogs(5);
 		const fetchSpy = jest
@@ -17,16 +17,13 @@ export type UseAgentLogsOptions = Readonly<{

 /**
 * Defines a custom hook that gives you all workspace agent logs for a given
- * workspace.
- *
- * Depending on the status of the workspace, all logs may or may not be
- * available.
+ * workspace. Depending on the status of the workspace, all logs may or may not
+ * be available.
 */
 export function useAgentLogs(
 	options: UseAgentLogsOptions,
 ): readonly WorkspaceAgentLog[] | undefined {
 	const { workspaceId, agentId, agentLifeCycleState, enabled = true } = options;
-
 	const queryClient = useQueryClient();
 	const queryOptions = agentLogs(workspaceId, agentId);
 	const { data: logs, isFetched } = useQuery({ ...queryOptions, enabled });
@@ -55,7 +52,17 @@ export function useAgentLogs(
 	});

 	useEffect(() => {
-		if (agentLifeCycleState !== "starting" || !isFetched) {
+		// Stream data only for new logs. Old logs should be loaded beforehand
+		// using a regular fetch to avoid overloading the websocket with all
+		// logs at once.
+		if (!isFetched) {
+			return;
+		}
+
+		// If the agent is off, we don't need to stream logs. This is the only state
+		// where the Coder API can't receive logs for the agent from third-party
+		// apps like envbuilder.
+		if (agentLifeCycleState === "off") {
 			return;
 		}

@@ -28,6 +28,15 @@ export const LoginPage: FC = () => {
 	const navigate = useNavigate();
 	const { metadata } = useEmbeddedMetadata();
 	const buildInfoQuery = useQuery(buildInfo(metadata["build-info"]));
+	let redirectError: Error | null = null;
+	let redirectUrl: URL | null = null;
+	try {
+		redirectUrl = new URL(redirectTo);
+	} catch {
+		// Do nothing
+	}
+
+	const isApiRouteRedirect = redirectTo.startsWith("/api/v2");

 	useEffect(() => {
 		if (!buildInfoQuery.data || isSignedIn) {
@@ -42,41 +51,24 @@ export const LoginPage: FC = () => {
 	}, [isSignedIn, buildInfoQuery.data, user?.id]);

 	if (isSignedIn) {
-		if (buildInfoQuery.data) {
-			// This uses `navigator.sendBeacon`, so window.href
-			// will not stop the request from being sent!
-			sendDeploymentEvent(buildInfoQuery.data, {
-				type: "deployment_login",
-				user_id: user?.id,
-			});
+		// The reason we need `window.location.href` for api redirects is that
+		// we need the page to reload and make a request to the backend. If we
+		// use `<Navigate>`, react would handle the redirect itself and never
+		// request the page from the backend.
+		if (isApiRouteRedirect) {
+			const sanitizedUrl = new URL(redirectTo, window.location.origin);
+			window.location.href = sanitizedUrl.pathname + sanitizedUrl.search;
+			// Setting the href should immediately request a new page. Show an
+			// error state if it doesn't.
+			redirectError = new Error("unable to redirect");
+		} else {
+			return (
+				<Navigate
+					to={redirectUrl ? redirectUrl.pathname : redirectTo}
+					replace
+				/>
+			);
 		}
-
-		// If the redirect is going to a workspace application, and we
-		// are missing authentication, then we need to change the href location
-		// to trigger a HTTP request. This allows the BE to generate the auth
-		// cookie required.  Similarly for the OAuth2 exchange as the authorization
-		// page is served by the backend.
-		// If no redirect is present, then ignore this branched logic.
-		if (redirectTo !== "" && redirectTo !== "/") {
-			try {
-				// This catches any absolute redirects. Relative redirects
-				// will fail the try/catch. Subdomain apps are absolute redirects.
-				const redirectURL = new URL(redirectTo);
-				if (redirectURL.host !== window.location.host) {
-					window.location.href = redirectTo;
-					return null;
-				}
-			} catch {
-				// Do nothing
-			}
-			// Path based apps and OAuth2.
-			if (redirectTo.includes("/apps/") || redirectTo.includes("/oauth2/")) {
-				window.location.href = redirectTo;
-				return null;
-			}
-		}
-
-		return <Navigate to={redirectTo} replace />;
 	}

 	if (isConfiguringTheFirstUser) {
@@ -90,7 +82,7 @@ export const LoginPage: FC = () => {
 			</Helmet>
 			<LoginPageView
 				authMethods={authMethodsQuery.data}
-				error={signInError}
+				error={signInError ?? redirectError}
 				isLoading={isLoading || authMethodsQuery.isLoading}
 				buildInfo={buildInfoQuery.data}
 				isSigningIn={isSigningIn}
@@ -98,6 +90,7 @@ export const LoginPage: FC = () => {
 					await signIn(email, password);
 					navigate("/");
 				}}
+				redirectTo={redirectTo}
 			/>
 		</>
 	);
@@ -6,7 +6,6 @@ import { Loader } from "components/Loader/Loader";
 import { type FC, useState } from "react";
 import { useLocation } from "react-router-dom";
 import { getApplicationName, getLogoURL } from "utils/appearance";
-import { retrieveRedirect } from "utils/redirect";
 import { SignInForm } from "./SignInForm";
 import { TermsOfServiceLink } from "./TermsOfServiceLink";

@@ -17,6 +16,7 @@ export interface LoginPageViewProps {
 	buildInfo?: BuildInfoResponse;
 	isSigningIn: boolean;
 	onSignIn: (credentials: { email: string; password: string }) => void;
+	redirectTo: string;
 }

 export const LoginPageView: FC<LoginPageViewProps> = ({
@@ -26,9 +26,9 @@ export const LoginPageView: FC<LoginPageViewProps> = ({
 	buildInfo,
 	isSigningIn,
 	onSignIn,
+	redirectTo,
 }) => {
 	const location = useLocation();
-	const redirectTo = retrieveRedirect(location.search);
 	// This allows messages to be displayed at the top of the sign in form.
 	// Helpful for any redirects that want to inform the user of something.
 	const message = new URLSearchParams(location.search).get("message");
@@ -154,7 +154,8 @@ export const WorkspaceParametersForm: FC<WorkspaceParameterFormProps> = ({
 				<FormFooter
 					onCancel={onCancel}
 					isLoading={isSubmitting}
-					submitDisabled={disabled}
+					submitLabel="Submit and restart"
+					submitDisabled={disabled || !form.dirty}
 				/>
 			</HorizontalForm>
 		</>
@@ -61,7 +61,9 @@ test("Submit the workspace settings page successfully", async () => {
 	);
 	await user.clear(parameter2);
 	await user.type(parameter2, "1");
-	await user.click(within(form).getByRole("button", { name: "Submit" }));
+	await user.click(
+		within(form).getByRole("button", { name: "Submit and restart" }),
+	);
 	// Assert that the API calls were made with the correct data
 	await waitFor(() => {
 		expect(postWorkspaceBuildSpy).toHaveBeenCalledWith(MockWorkspace.id, {
@@ -73,3 +75,58 @@ test("Submit the workspace settings page successfully", async () => {
 		});
 	});
 });
+
+test("Submit button is only enabled when changes are made", async () => {
+	// Mock the API calls that loads data
+	jest
+		.spyOn(API, "getWorkspaceByOwnerAndName")
+		.mockResolvedValueOnce(MockWorkspace);
+	jest.spyOn(API, "getTemplateVersionRichParameters").mockResolvedValueOnce([
+		MockTemplateVersionParameter1,
+		MockTemplateVersionParameter2,
+		// Immutable parameters
+		MockTemplateVersionParameter4,
+	]);
+	jest.spyOn(API, "getWorkspaceBuildParameters").mockResolvedValueOnce([
+		MockWorkspaceBuildParameter1,
+		MockWorkspaceBuildParameter2,
+		// Immutable value
+		MockWorkspaceBuildParameter4,
+	]);
+	// Setup event and rendering
+	const user = userEvent.setup();
+	renderWithWorkspaceSettingsLayout(<WorkspaceParametersPage />, {
+		route: "/@test-user/test-workspace/settings",
+		path: "/:username/:workspace/settings",
+		// Need this because after submit the user is redirected
+		extraRoutes: [{ path: "/:username/:workspace", element: <div /> }],
+	});
+	await waitForLoaderToBeRemoved();
+
+	const submitButton: HTMLButtonElement = screen.getByRole("button", {
+		name: "Submit and restart",
+	});
+
+	const form = screen.getByTestId("form");
+	const parameter1 = within(form).getByLabelText(
+		MockWorkspaceBuildParameter1.name,
+		{ exact: false },
+	);
+
+	// There are no changes, the button should be disabled.
+	expect(submitButton.disabled).toBeTruthy();
+
+	// Make changes to the form
+	await user.clear(parameter1);
+	await user.type(parameter1, "new-value");
+
+	// There are now changes, the button should be enabled.
+	expect(submitButton.disabled).toBeFalsy();
+
+	// Change form value back to default
+	await user.clear(parameter1);
+	await user.type(parameter1, MockWorkspaceBuildParameter1.value);
+
+	// There are now no changes, the button should be disabled.
+	expect(submitButton.disabled).toBeTruthy();
+});
@@ -21,6 +21,7 @@
 	"confluence.svg",
 	"container.svg",
 	"cpp.svg",
+	"cursor.svg",
 	"database.svg",
 	"datagrip.svg",
 	"dataspell.svg",
@@ -9,7 +9,7 @@ function defaultDocsUrl(): string {
 	}

 	// Strip the postfix version info that's not part of the link.
-	const i = version?.indexOf("-") ?? -1;
+	const i = version?.match(/[+-]/)?.index ?? -1;
 	if (i >= 0) {
 		version = version.slice(0, i);
 	}
Author	SHA1	Message	Date
Stephen Kirby	e4964aef23	chore: log provider stack traces on text file busy (#15249) re: #14726 If we see "text file busy" in the errors while initializing terraform, attempt to query the pprof endpoint set up by https://github.com/coder/terraform-provider-coder/pull/295 and log at CRITICAL. --------- Signed-off-by: Spike Curtis <spike@coder.com> (cherry picked from commit `d676ad56fe`) Co-authored-by: Spike Curtis <spike@coder.com>	2024-10-28 10:59:02 -05:00
Jon Ayers	f214d039db	fix(site): sanitize login redirect (#15208) (#15220) Co-authored-by: Colin Adler <colin1adler@gmail.com>	2024-10-24 20:49:03 +01:00
Jon Ayers	c9c90c4d29	fix: fix error handling to prevent spam in proc prio management (#15071) (#15095)	2024-10-15 16:20:22 -05:00
Jon Ayers	6f68315f3b	fix(site): fix agent logs streaming for third party apps (#14541) (#15022) Co-authored-by: Bruno Quaresma <bruno@coder.com>	2024-10-08 12:52:06 -05:00
Jon Ayers	3716afac46	fix: add benign error suppression for process priority management (#15020) This PR backports some benign error suppression into 2.15	2024-10-08 12:02:05 -05:00
Muhammad Atif Ali	0f63510d0d	feat: add cursor IDE icon to `release/2.15` (#14962)	2024-10-05 21:54:01 +05:00
Stephen Kirby	003dc5cc03	chore: patch known bugs in stable (#14925) - [x] https://github.com/coder/coder/pull/14601 - [x] https://github.com/coder/coder/pull/14602 - [x] https://github.com/coder/coder/pull/14633 --------- Co-authored-by: Justin Fowler <justinfowler1996@gmail.com> Co-authored-by: Ethan <39577870+ethanndickson@users.noreply.github.com> Co-authored-by: Danielle Maywood <danielle@themaywoods.com>	2024-10-01 17:20:26 -05:00
Stephen Kirby	190cd1c713	chore: apply fixes for the 2.15 release (#14540) * Minor fixups, added troubleshooting (#14519) (cherry picked from commit `66c8060605`) * fix: allow posting licenses that will be valid in future (#14491) (cherry picked from commit `5bd5801286`) * fix: stop reporting future licenses as errors (#14492) (cherry picked from commit `4eac2acede`) --------- Co-authored-by: Danny Kopping <danny@coder.com> Co-authored-by: Spike Curtis <spike@coder.com>	2024-09-03 11:31:04 -05:00