From 97e7c6503f6df861475fc008e6b6f6be27027c79 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Fri, 8 May 2026 11:38:57 +0200 Subject: [PATCH 01/10] chore(ai): Add check-code-attribution Claude Code skill (JAVA-499) Adds a check-code-attribution skill that verifies license headers + THIRD_PARTY_NOTICES.md entries for code copied or adapted from third parties. Reports any invalid headers and entries in the branch diff, along with suggestions for their correction. Implementation notes: - Uses a cheap-to-excute and easy-to-update script to detect attribution candidates (see find-attribution-candidates.sh). - Relies on the LLM for fuzzy tasks (viz, determining the adequacy of candidate attributions + generating suggestions). - Includes a shell-based test harness with 30 tests covering edge cases (see test-find-attribution-candidates.sh). --------- Co-authored-by: Claude Opus 4.6 (1M context) --- .claude/skills/.gitignore | 2 + .../CODE_ATTRIBUTION_CRITERIA.md | 13 + .../skills/check-code-attribution/SKILL.md | 210 ++++ .../find-attribution-candidates.sh | 546 +++++++++++ .../generated-code-exclusions.txt | 38 + .../test/test-find-attribution-candidates.sh | 904 ++++++++++++++++++ AGENTS.md | 10 +- 7 files changed, 1714 insertions(+), 9 deletions(-) create mode 100644 .claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md create mode 100644 .claude/skills/check-code-attribution/SKILL.md create mode 100755 .claude/skills/check-code-attribution/find-attribution-candidates.sh create mode 100644 .claude/skills/check-code-attribution/generated-code-exclusions.txt create mode 100644 .claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh diff --git a/.claude/skills/.gitignore b/.claude/skills/.gitignore index 229f4495ee3..2dd55eba801 100644 --- a/.claude/skills/.gitignore +++ b/.claude/skills/.gitignore @@ -8,3 +8,5 @@ !test/** !btrace-perfetto/ !btrace-perfetto/** +!check-code-attribution/ +!check-code-attribution/** diff --git a/.claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md b/.claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md new file mode 100644 index 00000000000..6d5d1e57b0f --- /dev/null +++ b/.claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md @@ -0,0 +1,13 @@ +# Third-Party Code Attribution + +When adapting code from third-party libraries: + +1. Add a license header at the top of the adapted file (before the `package` statement): + ```java + // Adapted from . + // Copyright . + // Licensed under the . + // + ``` + +2. Add a full attribution entry to `THIRD_PARTY_NOTICES.md` following the existing format (Source, License, Copyright, Scope, full license text) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md new file mode 100644 index 00000000000..b42fff33bd0 --- /dev/null +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -0,0 +1,210 @@ +--- +name: check-code-attribution +description: Check vendored code attributions in branch diff and flag any that are deficient. Use when asked to "check attribution", "check licenses", "verify vendored code attribution", or "check code attribution". +allowed-tools: Bash, Read, Grep, Glob +--- + +# Check Code Attribution + +Verify that vendored/adapted third-party code in the current branch diff has correct license headers and `THIRD_PARTY_NOTICES.md` entries. + +## Step 1: Run the Analysis Script + +Run the pre-filter and analysis script: + +```bash +bash .claude/skills/check-code-attribution/find-attribution-candidates.sh +``` + +If the script exits with code 0, there are no candidates. Print "No attribution issues found." + +If the script exits with any other code besides 0 or 10, it failed (e.g., could not determine merge-base). Print the stderr output and **stop** — do not attempt to interpret partial output. + +If the script exits with code 10, it outputs structured blocks for two types of candidates: + +**Source-file candidates** (changed files with attribution markers): + +``` +--- +candidate_type: source_file +file: +status: A|M|D|R +reasons: +notices_match: url|none|deleted +notices_entry: +new_license_type: true|false +notices_file_exists: true|false +notices_file_changed: true|false +--- +``` + +**NOTICES-entry candidates** (entries removed or modified in `THIRD_PARTY_NOTICES.md`): + +``` +--- +candidate_type: notices_entry +notices_entry: +notices_change: removed|modified +scope_text_start: + +scope_text_end: +notices_file_exists: true|false +notices_file_changed: true|false +--- +``` + +The script deterministically handles: +- Candidate identification and filtering (attribution markers, third-party copyright/license headers, vendor-like path segments) +- Rename detection (`--find-renames`) +- Removed attribution detection (catches accidental stripping of attribution headers) +- URL-based matching to `THIRD_PARTY_NOTICES.md` entries +- New license type detection +- Deleted file identification +- Detection of whether `THIRD_PARTY_NOTICES.md` exists in the repo +- Tracking whether `THIRD_PARTY_NOTICES.md` itself was modified in the diff +- Detection of removed `THIRD_PARTY_NOTICES.md` entries (headings present in old but absent in new version) +- Detection of modified `THIRD_PARTY_NOTICES.md` entries (entry content changed between old and new version) +- Scope text extraction for removed/modified entries + +Parse the output and proceed to Step 2. + +## Step 2: Check Attribution Format + +**Skip this step for deleted files** — there is nothing to fix in a file that no longer exists. + +**Skip this step for `candidate_type: notices_entry` candidates** — these are cross-referenced in Step 3. + +Read `CODE_ATTRIBUTION_CRITERIA.md` (in this skill's directory) for the canonical attribution format. + +For each actionable non-deleted candidate, read its file content and check for the required fields from `CODE_ATTRIBUTION_CRITERIA.md`. + +Extra information beyond the required fields is fine, as are differences in formatting. Only flag **missing** fields. + +Record which fields are missing for each file. + +## Step 3: Interpret Script Results + +For each candidate, determine one **primary finding** and zero or more **additional warnings**. + +### NOTICES-entry findings (for `candidate_type: notices_entry`) + +For candidates with `candidate_type: notices_entry`, determine one finding per entry: + +1. **Entry removed** (`notices_change: removed`) — Read the `scope_text` to identify the source files/packages referenced by this entry. Check whether the referenced files still exist in the repo (use `Glob` or `Read`). If they still exist and contain third-party attribution headers referencing the removed library, flag: the NOTICES entry was removed but vendored source files still reference this library — either restore the entry or remove the vendored code. If the referenced files were also deleted in this branch (they would appear as separate `candidate_type: source_file` candidates with `status: D`), this is consistent — note it as informational only. + +2. **Entry modified** (`notices_change: modified`) — The entry content was changed. Read the current `THIRD_PARTY_NOTICES.md` entry and the source files referenced in the scope text. Compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag any inconsistencies (e.g., copyright year mismatch, license mismatch). If the entry is consistent with the source files, note it as informational only. + +### Source-file primary finding (one per file — use the first that matches) + +Work through this list top-to-bottom and stop at the first match: + +1. **Deleted** (`status: D`) — report: "Deleted vendored file — verify `THIRD_PARTY_NOTICES.md` is still accurate." + +2. **Attribution removed** (`reasons` contains "attribution markers removed" but not "modified") — flag that attribution markers were removed and should be restored. Show the removed lines. + +3. **Attribution modified** (`reasons` contains "attribution markers modified") — remind the user to verify the `THIRD_PARTY_NOTICES.md` entry still matches the updated header. + +4. **Renamed** (`status: R`) — flag that the `THIRD_PARTY_NOTICES.md` Scope section likely needs updating to reflect the new path. + +5. **No NOTICES match** (`notices_match: none` and `notices_file_exists: true`) — the script could not URL-match this file to a `THIRD_PARTY_NOTICES.md` entry. Read the file's attribution header and `THIRD_PARTY_NOTICES.md`, then attempt a **fuzzy match** by library name, copyright holder, or other context. If you find a match, treat it as matched (use finding 6 instead). If not, flag that the file needs an entry added. + +6. **URL match** (`notices_match: url`) — the script found a URL match. Remind the user to verify that the matched entry's Scope section still accurately describes the vendored code. + +7. **No NOTICES file** (`notices_file_exists: false`) — `THIRD_PARTY_NOTICES.md` does not exist. Flag that it needs to be created and populated with an entry for this vendored library. + +### Additional warnings (zero or more per file, independent of each other) + +After determining the primary finding, check each of these independently. Skip all additional warnings for files whose primary finding is **Deleted** or **Attribution removed**. + +- **New license type** (`new_license_type: true`) — "-❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/." (Put this comment in its own bullet point.) + +- **NOTICES file not updated** (`notices_file_changed: false` and `notices_file_exists: true`) — if vendored code was added, modified, or renamed but `THIRD_PARTY_NOTICES.md` was not touched, remind that it may need updating. + +## Step 4: Output Results + +### No issues found + +Print "No attribution issues found." + +```bash +EXISTING_COMMENT_ID=$(gh api "repos/$OWNER/$REPO/issues/$PR_NUMBER/comments" \ + --paginate --jq '[.[] | select(.body | startswith("")) | .id] | first // empty') + +if [[ -n "$EXISTING_COMMENT_ID" ]]; then + gh api "repos/$OWNER/$REPO/issues/comments/$EXISTING_COMMENT_ID" -X DELETE +fi +``` + +### Issues found + +Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️ — these represent invalid or missing attribution that must be fixed before merging. Informational reminders (verify, check) should have a 👀 prefix. + +Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positivies; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line): + +``` +******************************************************************* +* Outcome of check-code-attribution * +******************************************************************* + +|-------------------------- Action items -------------------------- + +1. ⚠️ File: + Vendored code detected () — missing required fields: + - + - +``` + +For files where attribution markers were removed: +``` +1. ⚠️ File: + Required attribution field(s) removed: + - + - +``` + +For files with a matching THIRD_PARTY_NOTICES.md entry: +``` +1. 👀 File: + Vendored code detected () – verify that `THIRD_PARTY_NOTICES.md` reflects your updates. +``` + +For renamed files: +``` +1. 👀 File: + Vendored file renamed – Verify `THIRD_PARTY_NOTICES.md` reflects your updates. +``` + +For deleted files: +``` +1. 👀 File: + Deleted vendored file – Verify `THIRD_PARTY_NOTICES.md` reflects your updates. +``` + +For removed NOTICES entries where source files still exist: +``` +1. ⚠️ NOTICES entry removed: + Source file(s) still reference this library: + - `` still contains attribution header for . Either restore the `THIRD_PARTY_NOTICES.md` entry or remove the vendored code. +``` + +For modified NOTICES entries with inconsistencies: +``` +1. ⚠️ NOTICES entry modified: + Entry metadata inconsistent with source file headers: + - +``` + +For modified NOTICES entries that are consistent: +``` +1. 👀 NOTICES entry modified: + Verify updated entry is consistent with source file headers. +``` + +``` + +|------------------------- False positives ------------------------ + + +``` + +If there are no Action Items, tell the user "✅Everything looks good. No attribution issues found." diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh new file mode 100755 index 00000000000..2da0ed8431d --- /dev/null +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -0,0 +1,546 @@ +#!/usr/bin/env bash +# shellcheck shell=bash +# +# Script for finding files in the current branch diff that may have added, modified, or removed an +# attribution for vendored code (those files are referred to as "attribution candidates"). When used +# with the check-code-attribution skill, this script lets us avoid sending work to the LLM that we +# can do ourselves. +# +# There are two types of candidates: +# +# 1. Source-file candidates (changed files with attribution markers): +# +# --- +# candidate_type: source_file +# file: +# status: A|M|D|R (A = "added", M = "modified", D = "deleted", R = "renamed") +# reasons: +# notices_match: url|none|deleted +# notices_entry: +# new_license_type: true|false +# notices_file_exists: true|false +# notices_file_changed: true|false +# --- +# +# 2. NOTICES-entry candidates (entries removed or modified in THIRD_PARTY_NOTICES.md): +# +# --- +# candidate_type: notices_entry +# notices_entry: +# notices_change: removed|modified +# scope_text_start: +# +# scope_text_end: +# notices_file_exists: true|false +# notices_file_changed: true|false +# --- +# +# Exit code 0 = no candidates found, exit code 10 = candidates found. + +set -euo pipefail + +MERGE_BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main 2>/dev/null) || { + echo "Error: could not determine merge-base. Neither 'origin/main' nor 'main' is reachable from HEAD." >&2 + exit 2 +} +NOTICES_FILE="THIRD_PARTY_NOTICES.md" +notices_file_exists="true" +[[ ! -f "$NOTICES_FILE" ]] && notices_file_exists="false" +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +EXCLUSIONS_FILE="$SCRIPT_DIR/generated-code-exclusions.txt" + +# --- Filter terms --- +# Maintainer-friendly lists of terms used to identify vendored/attributed code. +# Update these when new attribution patterns or license types are encountered. + +# Strong indicators that code was adapted/copied from an external source +VENDORING_MARKERS='adapted from|backported from|copied from|derived from|ported from|translated from|vendored' + +# Recognized open-source license names (regex alternation) +LICENSE_NAMES='Apache 2\.0|Apache License|BSD [0-9]|BSD License|CC-BY|Creative Commons|Eclipse Public License|EPL|GNU General Public|GPL|ISC License|LGPL|MIT License|Mozilla Public|Public Domain|SPDX-License-Identifier|Unlicense' + +# Combined pattern for diff-hunk scanning of modified files (includes generic terms +# "copyright" and "licensed" which are appropriate for individual changed lines) +ATTRIBUTION_PATTERN="$VENDORING_MARKERS|copyright|licensed|$LICENSE_NAMES" + +# Sentry entity names — copyright lines mentioning these are treated as first-party +SENTRY_ENTITIES='functional software|getsentry|sentry software' + +# Path segments that suggest vendored/external code +VENDOR_PATH_MARKERS='external|shaded|third-party|third_party|thirdparty|vendor|vendored' + +is_binary_file() { + # -I treats binary files as non-matching → exit 1; empty pattern matches any text file → exit 0 + ! grep -Iq '' "$1" 2>/dev/null +} + +# Infrastructure/config directories that never contain vendored source code. +# For generated-file filename patterns, see generated-code-exclusions.txt. +is_excluded_path() { + [[ "$1" =~ ^\.claude/ || "$1" =~ ^\.github/ || "$1" =~ ^\.gradle/ || "$1" =~ ^\.idea/ || "$1" =~ ^\.mvn/ || "$1" =~ ^buildSrc/ || "$1" =~ ^build-logic/ || "$1" =~ ^gradle/ ]] +} + +# Load exclusion patterns once into a temp file (regexes from a repo-controlled file — review +# changes carefully). Using a file avoids re-creating a process substitution per candidate. +EXCLUSION_PATTERNS_FILE=$(mktemp) +CONTENT_FILE=$(mktemp) +OLD_NOTICES="" +trap 'rm -f "$EXCLUSION_PATTERNS_FILE" "$CONTENT_FILE" ${OLD_NOTICES:+"$OLD_NOTICES"}' EXIT +if [[ -f "$EXCLUSIONS_FILE" ]]; then + grep -v '^#' "$EXCLUSIONS_FILE" | grep -v '^$' > "$EXCLUSION_PATTERNS_FILE" || true +fi + +is_generated_file() { + [[ -s "$EXCLUSION_PATTERNS_FILE" ]] && grep -qE -f "$EXCLUSION_PATTERNS_FILE" <<< "$1" +} + +has_vendor_path() { + local path_lower + path_lower=$(echo "$1" | tr '[:upper:]' '[:lower:]') + [[ "$path_lower" =~ (^|/)($VENDOR_PATH_MARKERS)(/|$) ]] +} + +has_third_party_attribution() { + local filepath="$1" + + # Vendoring markers are strong standalone indicators + grep -qiE "$VENDORING_MARKERS" "$filepath" && return 0 + + # A non-Sentry copyright line is a strong standalone indicator. + # Note: a line like "Copyright Sentry and Example Corp" is excluded because the Sentry entity + # matches — split dual-copyright lines onto separate lines in the source to avoid this. + grep -iE 'copyright' "$filepath" | grep -qivE "$SENTRY_ENTITIES" && return 0 + + # License keywords alone (without a non-Sentry copyright or vendoring marker) do NOT + # indicate vendored code — many first-party files carry the project's own license header. + + return 1 +} + +# Check if diff hunks contain added attribution-related lines +has_added_attribution_lines() { + local diff_output="$1" + grep -E '^\+' <<< "$diff_output" | grep -vE '^\+\+\+' | grep -qiE "$ATTRIBUTION_PATTERN" +} + +# Check if diff hunks contain removed attribution-related lines +has_removed_attribution_lines() { + local diff_output="$1" + grep -E '^-' <<< "$diff_output" | grep -v '^---' | grep -qiE "$ATTRIBUTION_PATTERN" +} + +# Extract URLs from a file. Capped at 50 to bound runtime on large files while still +# catching source URLs that appear after import blocks or lengthy preambles. +extract_urls() { + grep -oE 'https?://[^ )"<>]+' "$1" | head -50 +} + +# Try to match a candidate file to a THIRD_PARTY_NOTICES.md entry by URL. +# Collects all URLs, normalizes them, builds progressively shorter prefixes, +# and runs a single awk pass against THIRD_PARTY_NOTICES.md. +# Prints: "url" if matched, "none" otherwise. +match_by_url() { + local filepath="$1" + local urls + urls=$(extract_urls "$filepath") + + [[ -z "$urls" ]] && { echo "none"; return; } + + local normalized_urls + normalized_urls=$(while IFS= read -r url; do + echo "$url" | sed -E 's|https?://||' | sed 's|^www\.||' | sed 's|[[:space:]]*$||' | sed 's|/$||' + done <<< "$urls") + + [[ -z "$normalized_urls" ]] && { echo "none"; return; } + + local result + result=$(awk ' + function count_slashes(s, i, c) { + c = 0 + for (i = 1; i <= length(s); i++) + if (substr(s, i, 1) == "/") c++ + return c + } + NR == FNR { + u = tolower($0) + while (index(u, "/") > 0) { + # Require at least 2 slashes (e.g., github.com/org/repo) to avoid + # matching at the domain or org level. + if (count_slashes(u) >= 2) + prefixes[n++] = u + sub(/\/[^\/]*$/, "", u) + } + next + } + /^## / { current_heading = $0 } + { + line = tolower($0) + for (i = 0; i < n; i++) { + if (prefixes[i] != "" && index(line, prefixes[i]) > 0 && current_heading != "") { + plen = length(prefixes[i]) + if (plen > best_len) { + best_len = plen + best_heading = current_heading + } + } + } + } + END { + if (best_len > 0) + print "url\t" best_heading + } + ' <(echo "$normalized_urls") "$NOTICES_FILE" 2>/dev/null) + + if [[ -n "$result" ]]; then + echo "$result" + else + echo "none" + fi +} + +# Extract known license types from THIRD_PARTY_NOTICES.md headings +extract_known_license_types() { + grep '^## ' "$NOTICES_FILE" 2>/dev/null | grep -oE '\([^)]+\)' | tr -d '()' | sort -u +} + +# Detect the license type declared in a file's content +detect_license_type() { + local filepath="$1" + + # SPDX identifiers are the most precise — check them first + local spdx_id + spdx_id=$(grep -oiE 'SPDX-License-Identifier:[[:space:]]*[^ ]+' "$filepath" | head -1 | sed 's/.*:[[:space:]]*//') + if [[ -n "$spdx_id" ]]; then + local spdx_lower + spdx_lower=$(echo "$spdx_id" | tr '[:upper:]' '[:lower:]') + case "$spdx_lower" in + apache-2.0) echo "Apache 2.0"; return ;; + mit) echo "MIT"; return ;; + bsd-2-clause|bsd-3-clause) echo "BSD"; return ;; + lgpl-*) echo "LGPL"; return ;; + gpl-*) echo "GPL"; return ;; + mpl-*) echo "MPL"; return ;; + epl-*) echo "EPL"; return ;; + isc) echo "ISC"; return ;; + unlicense|cc0-*) echo "Public Domain"; return ;; + cc-by-*) echo "CC-BY"; return ;; + esac + fi + + if grep -qiE 'Apache License|Apache 2\.0' "$filepath"; then + echo "Apache 2.0" + elif grep -qiE 'MIT License' "$filepath"; then + echo "MIT" + elif grep -qiE 'BSD License|BSD [0-9]' "$filepath"; then + echo "BSD" + elif grep -qiE 'LGPL' "$filepath"; then + echo "LGPL" + elif grep -qiE 'GPL|GNU General Public' "$filepath"; then + echo "GPL" + elif grep -qiE 'Mozilla Public' "$filepath"; then + echo "MPL" + elif grep -qiE 'Eclipse Public License|EPL' "$filepath"; then + echo "EPL" + elif grep -qiE 'ISC License' "$filepath"; then + echo "ISC" + elif grep -qiE 'Unlicense|Public Domain' "$filepath"; then + echo "Public Domain" + elif grep -qiE 'Creative Commons|CC-BY' "$filepath"; then + echo "CC-BY" + else + echo "unknown" + fi +} + +# Extract the full entry content for a given ## heading from a NOTICES file. +# Returns all lines between the heading and the next ## heading (or EOF). +extract_entry() { + local notices_path="$1" heading="$2" + awk -v target="$heading" ' + /^## / { + if (in_target) exit + in_target = ($0 == target) + next + } + in_target && /^```/ { in_code = !in_code } + in_target && !in_code && /^---$/ { exit } + in_target { print } + ' "$notices_path" +} + +# Extract the ### Scope section text for a given ## heading from a NOTICES file. +# Returns the descriptive paragraph(s) between "### Scope" and the next section +# boundary (---, ##, or EOF), excluding fenced code blocks. +extract_scope() { + local notices_path="$1" heading="$2" + awk -v target="$heading" ' + /^## / { in_target = ($0 == target); in_scope = 0; next } + in_target && /^### Scope/ { in_scope = 1; next } + in_target && in_scope && /^(## |---)/ { exit } + in_target && in_scope && /^```/ { in_code = !in_code; next } + in_target && in_scope && !in_code { gsub(/^[[:space:]]+|[[:space:]]+$/, ""); if ($0 != "") print } + ' "$notices_path" +} + +# Collect changed files from all sources (committed, staged, unstaged, untracked), +# deduplicated by current filepath (first occurrence wins). The committed diff is listed +# first so its status character takes precedence — a file committed as "A" (added) won't +# be overridden by a staged or unstaged "M" (modified) for the same path. +collect_changed_files() { + { + git diff "$MERGE_BASE"..HEAD --name-status --find-renames 2>/dev/null || true + git diff --cached --name-status --find-renames 2>/dev/null || true + git diff --name-status --find-renames 2>/dev/null || true + git ls-files --others --exclude-standard 2>/dev/null | while IFS= read -r path; do + [[ -n "$path" ]] && printf 'A\t%s\n' "$path" + done + } | awk -F'\t' '{ + # For renames (R###), the current path is the third field (new name). + # For everything else, the current path is the last field. + if ($1 ~ /^R/ && NF >= 3) key = $3 + else key = $NF + if (!seen[key]++) print + }' +} + +# Diff from merge-base to the current worktree state (committed + staged + unstaged) +# in a single pass. No duplicate hunks because git diffs the merge-base blob +# directly against the current worktree file. +combined_diff() { + local filepath="$1" + git diff "$MERGE_BASE" -- "$filepath" 2>/dev/null || true +} + +# Check if THIRD_PARTY_NOTICES.md was modified in this branch (committed, staged, or unstaged) +notices_file_changed="false" +if git diff "$MERGE_BASE"..HEAD --name-only -- "$NOTICES_FILE" 2>/dev/null | grep -q . \ + || git diff --cached --name-only -- "$NOTICES_FILE" 2>/dev/null | grep -q . \ + || git diff --name-only -- "$NOTICES_FILE" 2>/dev/null | grep -q .; then + notices_file_changed="true" +fi + +# Load known license types once (skip if NOTICES file doesn't exist) +KNOWN_LICENSES="" +if [[ "$notices_file_exists" == "true" ]]; then + KNOWN_LICENSES=$(extract_known_license_types) +fi + +found_any=false + +# Process each changed file +while IFS=$'\t' read -r status filepath old_filepath; do + [[ -z "$status" ]] && continue + + status_char="${status:0:1}" + + # For renames, `read` assigns: status=R###, filepath=old_path, old_filepath=new_path. + # Swap so filepath holds the current (new) path and old_filepath holds the original. + if [[ "$status_char" == "R" ]]; then + current_path="$old_filepath" + old_filepath="$filepath" + filepath="$current_path" + fi + + is_excluded_path "$filepath" && continue + is_generated_file "$filepath" && continue + + # Skip binary files + if [[ "$status_char" != "D" ]] && is_binary_file "$filepath"; then + continue + fi + is_candidate=false + reasons=() + + # --- Determine content and candidate status --- + + if [[ "$status_char" == "A" ]]; then + # Cap at 100KB to avoid overhead on large generated files + if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then + echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 + fi + head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + + if has_vendor_path "$filepath"; then + is_candidate=true + reasons+=("path suggests vendored code") + fi + + if has_third_party_attribution "$CONTENT_FILE"; then + is_candidate=true + reasons+=("attribution markers in file") + fi + + elif [[ "$status_char" == "D" ]]; then + git show "$MERGE_BASE":"$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + + if has_vendor_path "$filepath"; then + is_candidate=true + reasons+=("deleted file in vendor path") + fi + + if has_third_party_attribution "$CONTENT_FILE"; then + is_candidate=true + reasons+=("deleted file with attribution markers") + fi + + elif [[ "$status_char" == "R" ]]; then + # Renamed file: old_filepath has the original path + if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then + echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 + fi + head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + + if has_vendor_path "$filepath" || has_vendor_path "${old_filepath:-}"; then + is_candidate=true + reasons+=("renamed file in vendor path") + fi + + if has_third_party_attribution "$CONTENT_FILE"; then + is_candidate=true + reasons+=("renamed file with attribution markers") + fi + + elif [[ "$status_char" == "M" ]]; then + if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then + echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 + fi + head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + diff_output=$(combined_diff "$filepath") + + has_added=false + has_removed=false + has_added_attribution_lines "$diff_output" && has_added=true + has_removed_attribution_lines "$diff_output" && has_removed=true + + if [[ "$has_added" == "false" && "$has_removed" == "false" ]]; then + continue + fi + + if [[ "$has_added" == "true" && "$has_removed" == "true" ]]; then + reasons+=("attribution markers modified") + elif [[ "$has_added" == "true" ]]; then + # The diff-hunk scan uses broad patterns (e.g., "copyright", "licensed") that + # match first-party headers too. When markers were only added (not removed), + # filter out files whose full content has no third-party attribution — those + # are Sentry's own license headers being added. + if ! has_third_party_attribution "$CONTENT_FILE" && ! has_vendor_path "$filepath"; then + continue + fi + reasons+=("attribution markers added") + else + reasons+=("attribution markers removed") + fi + is_candidate=true + + has_vendor_path "$filepath" && reasons+=("file in vendor path") + fi + + if [[ "$is_candidate" == "false" ]]; then + continue + fi + + # --- This file is an actionable candidate. Run deterministic checks. --- + + # THIRD_PARTY_NOTICES.md URL matching (skip for deleted files) + notices_match="none" + notices_entry="" + new_license_type="false" + + if [[ "$status_char" == "D" ]]; then + notices_match="deleted" + else + if [[ "$notices_file_exists" == "true" ]]; then + match_result=$(match_by_url "$CONTENT_FILE") + if [[ "$match_result" != "none" ]]; then + notices_match=$(echo "$match_result" | cut -f1) + notices_entry=$(echo "$match_result" | cut -f2-) + fi + + # New license type check + candidate_license=$(detect_license_type "$CONTENT_FILE") + if [[ "$candidate_license" != "unknown" ]]; then + if ! echo "$KNOWN_LICENSES" | grep -qi "$candidate_license"; then + new_license_type="true" + fi + fi + fi + fi + + # Format reasons as comma-separated string + reasons_str=$(IFS=', '; echo "${reasons[*]}") + + # Output structured block + echo "---" + echo "candidate_type: source_file" + echo "file: $filepath" + echo "status: $status_char" + echo "reasons: $reasons_str" + echo "notices_match: $notices_match" + echo "notices_entry: $notices_entry" + echo "new_license_type: $new_license_type" + echo "notices_file_exists: $notices_file_exists" + echo "notices_file_changed: $notices_file_changed" + echo "---" + + found_any=true + +done < <(collect_changed_files) + +# --- NOTICES-entry analysis (detect removed or modified entries) --- +# When THIRD_PARTY_NOTICES.md was changed, diff the old and new entry headings to find +# entries that were removed or whose content was modified. This catches the reverse +# direction: source files are unchanged but their NOTICES entry was altered. +if [[ "$notices_file_changed" == "true" ]]; then + OLD_NOTICES=$(mktemp) + git show "$MERGE_BASE":"$NOTICES_FILE" > "$OLD_NOTICES" 2>/dev/null || true + + if [[ -s "$OLD_NOTICES" ]]; then + old_headings=$(grep '^## ' "$OLD_NOTICES" | sort) + new_headings="" + if [[ -f "$NOTICES_FILE" ]]; then + new_headings=$(grep '^## ' "$NOTICES_FILE" | sort) + fi + + # Removed headings: in old but not in new + while IFS= read -r heading; do + [[ -z "$heading" ]] && continue + scope_text=$(extract_scope "$OLD_NOTICES" "$heading") + echo "---" + echo "candidate_type: notices_entry" + echo "notices_entry: $heading" + echo "notices_change: removed" + echo "scope_text_start:" + [[ -n "$scope_text" ]] && echo "$scope_text" + echo "scope_text_end:" + echo "notices_file_exists: $notices_file_exists" + echo "notices_file_changed: $notices_file_changed" + echo "---" + found_any=true + done < <(comm -23 <(echo "$old_headings") <(echo "$new_headings")) + + # Modified entries: heading present in both, but entry content changed + while IFS= read -r heading; do + [[ -z "$heading" ]] && continue + old_entry=$(extract_entry "$OLD_NOTICES" "$heading") + new_entry=$(extract_entry "$NOTICES_FILE" "$heading") + if [[ "$old_entry" != "$new_entry" ]]; then + scope_text=$(extract_scope "$OLD_NOTICES" "$heading") + echo "---" + echo "candidate_type: notices_entry" + echo "notices_entry: $heading" + echo "notices_change: modified" + echo "scope_text_start:" + [[ -n "$scope_text" ]] && echo "$scope_text" + echo "scope_text_end:" + echo "notices_file_exists: $notices_file_exists" + echo "notices_file_changed: $notices_file_changed" + echo "---" + found_any=true + fi + done < <(comm -12 <(echo "$old_headings") <(echo "$new_headings")) + fi +fi + +if [[ "$found_any" == "true" ]]; then + exit 10 +fi diff --git a/.claude/skills/check-code-attribution/generated-code-exclusions.txt b/.claude/skills/check-code-attribution/generated-code-exclusions.txt new file mode 100644 index 00000000000..235c1cf4d63 --- /dev/null +++ b/.claude/skills/check-code-attribution/generated-code-exclusions.txt @@ -0,0 +1,38 @@ +# Filename patterns for auto-generated files that don't need to satisfy the +# attribution requirements for vendored code found in CODE_ATTRIBUTION_CRITERIA.md. + +# Android (AIDL, data binding, R class, BuildConfig — only under build/generated dirs) +\.aidl$ +/databinding/.*Binding\.java$ +/generated.*/R\.java$ +/generated.*/BuildConfig\.java$ + +# ANTLR (generated output files — match only under generated dirs or with ANTLR naming) +/generated.*/.*Lexer\.java$ +/generated.*/.*Parser\.java$ +/generated.*/.*Listener\.java$ +/generated.*/.*Visitor\.java$ +\.interp$ +\.tokens$ + +# API dump files (binary compatibility tracking) +\.api$ + +# Generated source directories +/generated/ + +# Gradle build scripts +build\.gradle(\.kts)?$ +settings\.gradle(\.kts)?$ + +# gRPC (generated stubs end with Grpc.java and live under generated-source dirs) +/grpc/.*Grpc\.java$ + +# Kotlin code-generator convention +\.g\.kt$ + +# KSP (Kotlin Symbol Processing) generated output +/ksp/ + +# Protocol Buffers +\.pb\.java$ diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh new file mode 100644 index 00000000000..9dd2c3028a9 --- /dev/null +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -0,0 +1,904 @@ +#!/usr/bin/env bash +# shellcheck shell=bash +# +# Tests for find-attribution-candidates.sh +# +# Each test creates a temporary git repo, sets up a specific scenario, +# runs the script, and asserts on its output and exit code. +# +# Usage: bash test-find-attribution-candidates.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SCRIPT="$SCRIPT_DIR/../find-attribution-candidates.sh" + +TESTS_RUN=0 +TESTS_PASSED=0 +TESTS_FAILED=0 + +RED='\033[0;31m' +GREEN='\033[0;32m' +NC='\033[0m' + +# --- Helpers --- + +setup_repo() { + local tmpdir + tmpdir=$(mktemp -d) + git -C "$tmpdir" init -b main --quiet + git -C "$tmpdir" config user.email "test@test.com" + git -C "$tmpdir" config user.name "Test" + + cat > "$tmpdir/THIRD_PARTY_NOTICES.md" << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +- Source: https://github.com/example/library +- License: Apache License 2.0 +- Copyright: Copyright 2024 Example Inc. +- Scope: `src/main/java/com/example/Foo.java` +NOTICES + git -C "$tmpdir" add THIRD_PARTY_NOTICES.md + git -C "$tmpdir" commit -m "Initial commit" --quiet + + echo "$tmpdir" +} + +setup_branch() { + git checkout -b test-branch --quiet +} + +cleanup_repo() { + rm -rf "$1" +} + +run_script() { + bash "$SCRIPT" 2>&1 +} + +get_field() { + local output="$1" field="$2" + echo "$output" | grep "^${field}: " | head -1 | sed "s/^${field}: //" +} + +assert_eq() { + local actual="$1" expected="$2" msg="$3" + if [[ "$actual" == "$expected" ]]; then + return 0 + fi + echo -e " ${RED}FAIL${NC}: $msg" + echo " expected: '$expected'" + echo " actual: '$actual'" + return 1 +} + +assert_contains() { + local haystack="$1" needle="$2" msg="$3" + if [[ "$haystack" == *"$needle"* ]]; then + return 0 + fi + echo -e " ${RED}FAIL${NC}: $msg" + echo " expected to contain: '$needle'" + echo " actual: '$haystack'" + return 1 +} + +get_scope_text() { + local output="$1" + echo "$output" | sed -n '/^scope_text_start:$/,/^scope_text_end:$/{ /^scope_text_start:$/d; /^scope_text_end:$/d; p; }' +} + +run_test() { + local test_name="$1" test_fn="$2" + TESTS_RUN=$((TESTS_RUN + 1)) + + local tmpdir original_dir + tmpdir=$(setup_repo) + original_dir=$(pwd) + cd "$tmpdir" + + local failed=false + echo -n "$test_name ... " + + if ! $test_fn; then + failed=true + fi + + cd "$original_dir" + cleanup_repo "$tmpdir" + + if [[ "$failed" == "true" ]]; then + TESTS_FAILED=$((TESTS_FAILED + 1)) + echo -e "${RED}FAILED${NC}" + else + TESTS_PASSED=$((TESTS_PASSED + 1)) + echo -e "${GREEN}ok${NC}" + fi +} + +# --- Test cases --- + +test_clean_branch_no_candidates() { + setup_branch + mkdir -p src + echo "package com.example; public class Clean {}" > src/Clean.java + git add src/Clean.java + git commit -m "Add clean file" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "should exit 0 when no candidates" +} + +test_new_file_with_attribution() { + setup_branch + mkdir -p src + cat > src/Adapted.java << 'JAVA' +// Adapted from SomeLibrary. +// Copyright 2024 Some Author. +// Licensed under the Apache License 2.0. +// https://github.com/some/library +package com.example; +public class Adapted {} +JAVA + git add src/Adapted.java + git commit -m "Add adapted file" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_eq "$(get_field "$output" "file")" "src/Adapted.java" "file path" || ok=false + assert_eq "$(get_field "$output" "status")" "A" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers in file" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_new_file_in_vendor_path() { + setup_branch + mkdir -p vendor/lib + echo "public class Foo {}" > vendor/lib/Foo.java + git add vendor/lib/Foo.java + git commit -m "Add vendored file" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "file")" "vendor/lib/Foo.java" "file path" || ok=false + assert_contains "$(get_field "$output" "reasons")" "path suggests vendored code" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_deleted_file_with_attribution() { + mkdir -p src + cat > src/Licensed.java << 'JAVA' +// Adapted from OldLib. +// Copyright 2023 Old Author. +// Licensed under the MIT License. +package com.example; +public class Licensed {} +JAVA + git add src/Licensed.java + git commit -m "Add licensed file" --quiet + + setup_branch + git rm src/Licensed.java --quiet + git commit -m "Remove licensed file" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_eq "$(get_field "$output" "file")" "src/Licensed.java" "file path" || ok=false + assert_eq "$(get_field "$output" "status")" "D" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "deleted file with attribution markers" "reasons" || ok=false + assert_eq "$(get_field "$output" "notices_match")" "deleted" "notices_match" || ok=false + [[ "$ok" == "true" ]] +} + +test_modified_file_attribution_added() { + mkdir -p src + echo "package com.example; public class Mod {}" > src/Mod.java + git add src/Mod.java + git commit -m "Add file" --quiet + + setup_branch + cat > src/Mod.java << 'JAVA' +// Adapted from NewLib. +// Copyright 2024 New Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Mod {} +JAVA + git add src/Mod.java + git commit -m "Add attribution" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_eq "$(get_field "$output" "status")" "M" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers added" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_modified_file_attribution_removed() { + mkdir -p src + cat > src/Strip.java << 'JAVA' +// Adapted from StripLib. +// Copyright 2024 Strip Author. +// Licensed under the MIT License. +package com.example; +public class Strip {} +JAVA + git add src/Strip.java + git commit -m "Add attributed file" --quiet + + setup_branch + cat > src/Strip.java << 'JAVA' +package com.example; +public class Strip {} +JAVA + git add src/Strip.java + git commit -m "Remove attribution" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "status")" "M" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers removed" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_renamed_file_has_correct_path() { + mkdir -p src/old + cat > src/old/Lib.java << 'JAVA' +// Adapted from RenameLib. +// Copyright 2024 Rename Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Lib {} +JAVA + git add src/old/Lib.java + git commit -m "Add file" --quiet + + setup_branch + mkdir -p src/new + git mv src/old/Lib.java src/new/Lib.java + git commit -m "Rename file" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "file")" "src/new/Lib.java" "file should be new path" || ok=false + assert_eq "$(get_field "$output" "status")" "R" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "renamed file with attribution markers" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_staged_new_file_detected() { + setup_branch + echo "placeholder" > placeholder.txt + git add placeholder.txt + git commit -m "Placeholder" --quiet + + mkdir -p src + cat > src/Staged.java << 'JAVA' +// Adapted from StagedLib. +// Copyright 2024 Staged Author. +// Licensed under the MIT License. +package com.example; +public class Staged {} +JAVA + git add src/Staged.java + # NOT committed + + local output ok=true + output=$(run_script) || true + assert_contains "$output" "src/Staged.java" "should detect staged file" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers in file" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_staged_modification_detected() { + mkdir -p src + echo "package com.example; public class StagedMod {}" > src/StagedMod.java + git add src/StagedMod.java + git commit -m "Add file" --quiet + + setup_branch + echo "x" > placeholder.txt + git add placeholder.txt + git commit -m "Placeholder" --quiet + + cat > src/StagedMod.java << 'JAVA' +// Adapted from StagedModLib. +// Copyright 2024 StagedMod Author. +// Licensed under the Apache License 2.0. +package com.example; +public class StagedMod {} +JAVA + git add src/StagedMod.java + # NOT committed — staged only + + local output ok=true + output=$(run_script) || true + assert_contains "$output" "src/StagedMod.java" "should detect staged modification" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers added" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_untracked_file_detected() { + setup_branch + echo "placeholder" > placeholder.txt + git add placeholder.txt + git commit -m "Placeholder" --quiet + + mkdir -p src + cat > src/Untracked.java << 'JAVA' +// Adapted from UntrackedLib. +// Copyright 2024 Untracked Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Untracked {} +JAVA + # NOT staged, NOT committed + + local output ok=true + output=$(run_script) || true + assert_contains "$output" "src/Untracked.java" "should detect untracked file" || ok=false + [[ "$ok" == "true" ]] +} + +test_notices_file_changed_true() { + setup_branch + mkdir -p src + cat > src/Noticed.java << 'JAVA' +// Adapted from NoticedLib. +// Copyright 2024 Noticed Author. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Noticed {} +JAVA + echo "## New Entry" >> THIRD_PARTY_NOTICES.md + git add src/Noticed.java THIRD_PARTY_NOTICES.md + git commit -m "Add attributed file and update notices" --quiet + + local output + output=$(run_script) || true + assert_eq "$(get_field "$output" "notices_file_changed")" "true" "notices_file_changed" +} + +test_notices_url_matching() { + setup_branch + mkdir -p src + cat > src/Matched.java << 'JAVA' +// Adapted from Example Library. +// Copyright 2024 Example Inc. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Matched {} +JAVA + git add src/Matched.java + git commit -m "Add file matching notices URL" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "notices_match")" "url" "notices_match" || ok=false + assert_contains "$(get_field "$output" "notices_entry")" "Example Library" "notices_entry" || ok=false + [[ "$ok" == "true" ]] +} + +test_mit_no_false_positive_from_commit() { + setup_branch + mkdir -p src + cat > src/CommitHelper.java << 'JAVA' +// Adapted from CommitLib. +// Copyright 2024 Commit Author. +package com.example; +public class CommitHelper { + // This COMMIT message should not trigger MIT detection + void commit() {} +} +JAVA + git add src/CommitHelper.java + git commit -m "Add file with COMMIT word" --quiet + + local output + output=$(run_script) || true + assert_eq "$(get_field "$output" "new_license_type")" "false" "COMMIT should not trigger MIT detection" +} + +test_excluded_paths_skipped() { + setup_branch + mkdir -p .github/workflows + cat > .github/workflows/Licensed.java << 'JAVA' +// Adapted from ExcludedLib. +// Copyright 2024 Excluded Author. +// Licensed under the MIT License. +package com.example; +public class Licensed {} +JAVA + git add .github/workflows/Licensed.java + git commit -m "Add file in excluded path" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "excluded paths should produce no candidates" +} + +test_generated_files_skipped() { + setup_branch + mkdir -p src/generated/com/example + cat > src/generated/com/example/R.java << 'JAVA' +// Adapted from GeneratedLib. +// Copyright 2024 Generated Author. +// Licensed under the Apache License 2.0. +package com.example; +public class R {} +JAVA + git add src/generated/com/example/R.java + git commit -m "Add generated file" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "generated files should produce no candidates" +} + +test_sentry_copyright_not_flagged() { + setup_branch + mkdir -p src + cat > src/SentryOwned.java << 'JAVA' +// Copyright 2024 Functional Software, Inc. +package com.example; +public class SentryOwned {} +JAVA + git add src/SentryOwned.java + git commit -m "Add Sentry-owned file" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "Sentry copyright should not trigger attribution" +} + +test_new_license_type_detected() { + setup_branch + mkdir -p src + cat > src/MplFile.java << 'JAVA' +// Adapted from MplLib. +// Copyright 2024 Mpl Author. +// Licensed under the Mozilla Public License. +package com.example; +public class MplFile {} +JAVA + git add src/MplFile.java + git commit -m "Add MPL file" --quiet + + local output + output=$(run_script) || true + assert_eq "$(get_field "$output" "new_license_type")" "true" "MPL should be flagged as new license type" +} + +test_notices_staged_change_detected() { + setup_branch + mkdir -p src + cat > src/StagedNotice.java << 'JAVA' +// Adapted from StagedNoticeLib. +// Copyright 2024 StagedNotice Author. +// Licensed under the Apache License 2.0. +package com.example; +public class StagedNotice {} +JAVA + git add src/StagedNotice.java + git commit -m "Add file" --quiet + + # Stage a change to THIRD_PARTY_NOTICES.md but don't commit + echo "## Staged Entry" >> THIRD_PARTY_NOTICES.md + git add THIRD_PARTY_NOTICES.md + # NOT committed + + local output + output=$(run_script) || true + assert_eq "$(get_field "$output" "notices_file_changed")" "true" "staged notices change should be detected" +} + +test_modified_vendor_path_file_attribution_changed() { + mkdir -p vendor/lib + cat > vendor/lib/Vendored.java << 'JAVA' +// Adapted from VendoredLib. +// Copyright 2024 Vendored Author. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Vendored { + void oldMethod() {} +} +JAVA + git add vendor/lib/Vendored.java + git commit -m "Add vendored file" --quiet + + setup_branch + cat > vendor/lib/Vendored.java << 'JAVA' +// Adapted from VendoredLib. +// Copyright 2025 Vendored Author. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Vendored { + void newMethod() {} +} +JAVA + git add vendor/lib/Vendored.java + git commit -m "Update vendored file" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "status")" "M" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers modified" "reasons should include attribution change" || ok=false + assert_contains "$(get_field "$output" "reasons")" "file in vendor path" "reasons should include vendor path" || ok=false + [[ "$ok" == "true" ]] +} + +test_binary_file_skipped() { + setup_branch + mkdir -p src + # Create a file with a null byte so git treats it as binary + printf '// Adapted from BinaryLib.\n// Copyright 2024 Binary Author.\n\x00binary content' > src/Binary.dat + git add src/Binary.dat + git commit -m "Add binary file" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "binary files should be skipped" +} + +# Many first-party Sentry files carry the project's own Apache 2.0 license header. +# A license header alone (without a vendoring marker like "Adapted from" or a +# non-Sentry copyright) does not indicate vendored code and should not be flagged. +test_license_only_header_not_flagged() { + setup_branch + mkdir -p src + cat > src/LicenseOnly.java << 'JAVA' +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + */ +package com.example; +public class LicenseOnly {} +JAVA + git add src/LicenseOnly.java + git commit -m "Add file with license-only header" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "license-only header without third-party copyright should not trigger attribution" +} + +test_modified_vendored_file_no_attribution_change() { + mkdir -p src + cat > src/Vendored.java << 'JAVA' +// Adapted from VendoredLib. +// Copyright 2024 Vendored Author. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Vendored { + void oldMethod() {} +} +JAVA + git add src/Vendored.java + git commit -m "Add vendored file" --quiet + + setup_branch + cat > src/Vendored.java << 'JAVA' +// Adapted from VendoredLib. +// Copyright 2024 Vendored Author. +// Licensed under the Apache License 2.0. +// https://github.com/example/library +package com.example; +public class Vendored { + void newMethod() {} +} +JAVA + git add src/Vendored.java + git commit -m "Fix bug in vendored file" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "should not flag vendored file when only non-attribution lines changed" +} + +# Adding Sentry's own license header to a first-party file should not trigger. +test_modified_first_party_license_header_not_flagged() { + mkdir -p src + echo "package com.example; public class FirstParty {}" > src/FirstParty.java + git add src/FirstParty.java + git commit -m "Add file" --quiet + + setup_branch + cat > src/FirstParty.java << 'JAVA' +/* + * Copyright 2025 Functional Software, Inc. + * Licensed under the Apache License, Version 2.0. + */ +package com.example; +public class FirstParty {} +JAVA + git add src/FirstParty.java + git commit -m "Add Sentry license header" --quiet + + local exit_code=0 + run_script > /dev/null 2>&1 || exit_code=$? + assert_eq "$exit_code" "0" "adding Sentry's own license header should not trigger attribution" +} + +test_unstaged_modification_detected() { + mkdir -p src + echo "package com.example; public class Unstaged {}" > src/Unstaged.java + git add src/Unstaged.java + git commit -m "Add file" --quiet + + setup_branch + echo "x" > placeholder.txt + git add placeholder.txt + git commit -m "Placeholder" --quiet + + cat > src/Unstaged.java << 'JAVA' +// Adapted from UnstagedLib. +// Copyright 2024 Unstaged Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Unstaged {} +JAVA + # NOT staged, NOT committed — worktree only + + local output ok=true + output=$(run_script) || true + assert_contains "$output" "src/Unstaged.java" "should detect unstaged modification" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers added" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_multiple_candidates() { + setup_branch + mkdir -p src + cat > src/First.java << 'JAVA' +// Adapted from FirstLib. +// Copyright 2024 First Author. +// Licensed under the MIT License. +package com.example; +public class First {} +JAVA + cat > src/Second.java << 'JAVA' +// Adapted from SecondLib. +// Copyright 2024 Second Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Second {} +JAVA + git add src/First.java src/Second.java + git commit -m "Add two attributed files" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + local count + count=$(echo "$output" | grep -c "^file: " || true) + assert_eq "$count" "2" "should find exactly 2 candidates" || ok=false + assert_contains "$output" "src/First.java" "should include First.java" || ok=false + assert_contains "$output" "src/Second.java" "should include Second.java" || ok=false + [[ "$ok" == "true" ]] +} + +test_missing_notices_file() { + git rm THIRD_PARTY_NOTICES.md --quiet + git commit -m "Remove notices file" --quiet + + setup_branch + mkdir -p src + cat > src/Vendored.java << 'JAVA' +// Adapted from SomeLib. +// Copyright 2024 Some Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Vendored {} +JAVA + git add src/Vendored.java + git commit -m "Add vendored file" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should still find candidates" || ok=false + assert_eq "$(get_field "$output" "notices_file_exists")" "false" "notices_file_exists" || ok=false + assert_eq "$(get_field "$output" "notices_match")" "none" "notices_match" || ok=false + assert_eq "$(get_field "$output" "new_license_type")" "false" "new_license_type should be false when no notices file" || ok=false + [[ "$ok" == "true" ]] +} + +test_progressive_url_matching() { + setup_branch + mkdir -p src + cat > src/DeepUrl.java << 'JAVA' +// Adapted from Example Library. +// Copyright 2024 Example Inc. +// Licensed under the Apache License 2.0. +// https://github.com/example/library/tree/main/src/Foo.java +package com.example; +public class DeepUrl {} +JAVA + git add src/DeepUrl.java + git commit -m "Add file with deep URL" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "notices_match")" "url" "deep URL should match via progressive trimming" || ok=false + assert_contains "$(get_field "$output" "notices_entry")" "Example Library" "notices_entry" || ok=false + [[ "$ok" == "true" ]] +} + +test_notices_entry_removed() { + cat > THIRD_PARTY_NOTICES.md << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +**Source:** https://github.com/example/library +**License:** Apache License 2.0 +**Copyright:** Copyright 2024 Example Inc. + +### Scope + +The code resides in `com.example.Foo`. + +--- + +## Other Library (MIT) + +**Source:** https://github.com/other/library +**License:** MIT License +**Copyright:** Copyright 2024 Other Inc. + +### Scope + +The code resides in `com.other.Bar`. +NOTICES + git add THIRD_PARTY_NOTICES.md + git commit -m "Two NOTICES entries" --quiet --amend + + setup_branch + + cat > THIRD_PARTY_NOTICES.md << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +**Source:** https://github.com/example/library +**License:** Apache License 2.0 +**Copyright:** Copyright 2024 Example Inc. + +### Scope + +The code resides in `com.example.Foo`. +NOTICES + git add THIRD_PARTY_NOTICES.md + git commit -m "Remove Other Library entry" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when NOTICES entry removed" || ok=false + assert_contains "$output" "candidate_type: notices_entry" "should have notices_entry candidate" || ok=false + assert_contains "$output" "notices_change: removed" "should flag as removed" || ok=false + assert_contains "$output" "## Other Library (MIT)" "should identify removed entry" || ok=false + local scope + scope=$(get_scope_text "$output") + assert_contains "$scope" "com.other.Bar" "scope should reference source files" || ok=false + [[ "$ok" == "true" ]] +} + +test_notices_entry_modified() { + cat > THIRD_PARTY_NOTICES.md << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +**Source:** https://github.com/example/library +**License:** Apache License 2.0 +**Copyright:** Copyright 2024 Example Inc. + +### Scope + +The code resides in `com.example.Foo`. +NOTICES + git add THIRD_PARTY_NOTICES.md + git commit -m "NOTICES with entry" --quiet --amend + + setup_branch + + cat > THIRD_PARTY_NOTICES.md << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +**Source:** https://github.com/example/library +**License:** Apache License 2.0 +**Copyright:** Copyright 2025 Example Inc. + +### Scope + +The code resides in `com.example.Foo`. +NOTICES + git add THIRD_PARTY_NOTICES.md + git commit -m "Change copyright year in NOTICES" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when NOTICES entry modified" || ok=false + assert_contains "$output" "candidate_type: notices_entry" "should have notices_entry candidate" || ok=false + assert_contains "$output" "notices_change: modified" "should flag as modified" || ok=false + assert_contains "$output" "## Example Library (Apache 2.0)" "should identify modified entry" || ok=false + [[ "$ok" == "true" ]] +} + +test_source_file_has_candidate_type() { + setup_branch + mkdir -p src + cat > src/Typed.java << 'JAVA' +// Adapted from TypedLib. +// Copyright 2024 Typed Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Typed {} +JAVA + git add src/Typed.java + git commit -m "Add typed file" --quiet + + local output ok=true + output=$(run_script) || true + assert_contains "$output" "candidate_type: source_file" "should include candidate_type field" || ok=false + [[ "$ok" == "true" ]] +} + +# --- Run all tests --- + +run_test "Clean branch — no candidates" test_clean_branch_no_candidates +run_test "New file with attribution markers" test_new_file_with_attribution +run_test "New file in vendor path" test_new_file_in_vendor_path +run_test "Deleted file with attribution" test_deleted_file_with_attribution +run_test "Modified file — attribution added" test_modified_file_attribution_added +run_test "Modified file — attribution removed" test_modified_file_attribution_removed +run_test "Renamed file — correct path" test_renamed_file_has_correct_path +run_test "Staged new file detected" test_staged_new_file_detected +run_test "Staged modification detected" test_staged_modification_detected +run_test "Untracked file detected" test_untracked_file_detected +run_test "THIRD_PARTY_NOTICES.md change — committed" test_notices_file_changed_true +run_test "THIRD_PARTY_NOTICES.md change — staged" test_notices_staged_change_detected +run_test "URL matching to notices entries" test_notices_url_matching +run_test "MIT — no false positive from COMMIT" test_mit_no_false_positive_from_commit +run_test "Excluded paths skipped" test_excluded_paths_skipped +run_test "Generated files skipped" test_generated_files_skipped +run_test "Sentry copyright not flagged" test_sentry_copyright_not_flagged +run_test "New license type detected" test_new_license_type_detected +run_test "License-only header not flagged" test_license_only_header_not_flagged +run_test "Binary file skipped" test_binary_file_skipped +run_test "Modified vendor-path file — attribution changed" test_modified_vendor_path_file_attribution_changed +run_test "Modified vendored file — no attribution change" test_modified_vendored_file_no_attribution_change +run_test "Modified first-party file — Sentry license header not flagged" test_modified_first_party_license_header_not_flagged +run_test "Unstaged modification detected" test_unstaged_modification_detected +run_test "Multiple candidates in single run" test_multiple_candidates +run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file +run_test "Progressive URL matching" test_progressive_url_matching +run_test "NOTICES entry removed" test_notices_entry_removed +run_test "NOTICES entry modified" test_notices_entry_modified +run_test "Source-file candidate has candidate_type" test_source_file_has_candidate_type + +# --- Summary --- + +echo "" +echo "==============================" +echo "Tests run: $TESTS_RUN" +echo -e "Passed: ${GREEN}$TESTS_PASSED${NC}" +if [[ $TESTS_FAILED -gt 0 ]]; then + echo -e "Failed: ${RED}$TESTS_FAILED${NC}" + exit 1 +else + echo "Failed: 0" + echo -e "${GREEN}All tests passed.${NC}" +fi diff --git a/AGENTS.md b/AGENTS.md index 42a8e651004..a32e4b3a3c1 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -145,15 +145,7 @@ The repository is organized into multiple modules: 5. Consider backwards compatibility ### Third-Party Code Attribution -When adapting code from third-party libraries: -1. Add a license header at the top of the adapted file (before the `package` statement): - ```java - // Adapted from . - // Copyright . - // Licensed under the . - // - ``` -2. Add a full attribution entry to `THIRD_PARTY_NOTICES.md` following the existing format (Source, License, Copyright, Scope, full license text) +See [`.claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md`](.claude/skills/check-code-attribution/CODE_ATTRIBUTION_CRITERIA.md). ### Getting PR Information From 5c9a6caa055e9d24fce1ba55f61819dd45d0b4f3 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Mon, 11 May 2026 16:47:44 +0200 Subject: [PATCH 02/10] Avoid pipefail crashes when grep finds no matches in find-attribution-candidates script + minor LLM prompt updates --- .../skills/check-code-attribution/SKILL.md | 22 ++++++++++--------- .../find-attribution-candidates.sh | 6 ++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index b42fff33bd0..0e7059c089f 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -66,6 +66,8 @@ The script deterministically handles: - Detection of modified `THIRD_PARTY_NOTICES.md` entries (entry content changed between old and new version) - Scope text extraction for removed/modified entries +**Important:** The script checks three layers of changes: (1) committed changes on the branch vs. the merge-base, (2) staged but uncommitted changes, and (3) unstaged working-tree changes. A candidate may not appear in `git diff merge-base..HEAD` if it is only staged or only in the working tree. Do NOT dismiss a candidate as a false positive just because it is absent from the committed branch diff — check `git status`, `git diff --cached`, and `git diff` (unstaged) to see the full picture. + Parse the output and proceed to Step 2. ## Step 2: Check Attribution Format @@ -139,18 +141,18 @@ fi Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️ — these represent invalid or missing attribution that must be fixed before merging. Informational reminders (verify, check) should have a 👀 prefix. -Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positivies; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line): +Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positivies; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): ``` -******************************************************************* -* Outcome of check-code-attribution * -******************************************************************* +********************************************************************************** +* Outcome of check-code-attribution * +********************************************************************************** -|-------------------------- Action items -------------------------- +----------------------------------- Action items --------------------------------- 1. ⚠️ File: Vendored code detected () — missing required fields: - - + - - ``` @@ -158,7 +160,7 @@ For files where attribution markers were removed: ``` 1. ⚠️ File: Required attribution field(s) removed: - - + - - ``` @@ -202,9 +204,9 @@ For modified NOTICES entries that are consistent: ``` -|------------------------- False positives ------------------------ +---------------------------------- False positives ------------------------------- - + ``` -If there are no Action Items, tell the user "✅Everything looks good. No attribution issues found." +If there are no Action Items, print the following after *all* sections: "✅ Everything looks good. No attribution issues found." diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 2da0ed8431d..59c1f79d884 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -132,7 +132,7 @@ has_removed_attribution_lines() { # Extract URLs from a file. Capped at 50 to bound runtime on large files while still # catching source URLs that appear after import blocks or lengthy preambles. extract_urls() { - grep -oE 'https?://[^ )"<>]+' "$1" | head -50 + { grep -oE 'https?://[^ )"<>]+' "$1" || true; } | head -50 } # Try to match a candidate file to a THIRD_PARTY_NOTICES.md entry by URL. @@ -495,10 +495,10 @@ if [[ "$notices_file_changed" == "true" ]]; then git show "$MERGE_BASE":"$NOTICES_FILE" > "$OLD_NOTICES" 2>/dev/null || true if [[ -s "$OLD_NOTICES" ]]; then - old_headings=$(grep '^## ' "$OLD_NOTICES" | sort) + old_headings=$({ grep '^## ' "$OLD_NOTICES" || true; } | sort) new_headings="" if [[ -f "$NOTICES_FILE" ]]; then - new_headings=$(grep '^## ' "$NOTICES_FILE" | sort) + new_headings=$({ grep '^## ' "$NOTICES_FILE" || true; } | sort) fi # Removed headings: in old but not in new From 89418e0bfcbea34ab289e417001a5631285ba9e1 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Tue, 12 May 2026 09:02:29 +0200 Subject: [PATCH 03/10] Fix IFS join to produce comma-space separation in attribution candidate reasons + remove the gh api comment-cleanup block as it's a holdover from an earlier CI implementation + minor cleanup. --- .../skills/check-code-attribution/SKILL.md | 19 +--- .../find-attribution-candidates.sh | 14 ++- .../test/test-find-attribution-candidates.sh | 105 ++++++++++++++++-- 3 files changed, 114 insertions(+), 24 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index 0e7059c089f..d01fd033e6c 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -118,7 +118,7 @@ Work through this list top-to-bottom and stop at the first match: After determining the primary finding, check each of these independently. Skip all additional warnings for files whose primary finding is **Deleted** or **Attribution removed**. -- **New license type** (`new_license_type: true`) — "-❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/." (Put this comment in its own bullet point.) +- **New license type** (`new_license_type: true`) — "- ❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/." (Put this comment in its own bullet point.) - **NOTICES file not updated** (`notices_file_changed: false` and `notices_file_exists: true`) — if vendored code was added, modified, or renamed but `THIRD_PARTY_NOTICES.md` was not touched, remind that it may need updating. @@ -128,20 +128,11 @@ After determining the primary finding, check each of these independently. Skip a Print "No attribution issues found." -```bash -EXISTING_COMMENT_ID=$(gh api "repos/$OWNER/$REPO/issues/$PR_NUMBER/comments" \ - --paginate --jq '[.[] | select(.body | startswith("")) | .id] | first // empty') - -if [[ -n "$EXISTING_COMMENT_ID" ]]; then - gh api "repos/$OWNER/$REPO/issues/comments/$EXISTING_COMMENT_ID" -X DELETE -fi -``` - ### Issues found Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️ — these represent invalid or missing attribution that must be fixed before merging. Informational reminders (verify, check) should have a 👀 prefix. -Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positivies; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): +Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positives; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): ``` ********************************************************************************** @@ -153,15 +144,15 @@ Use the following format (only include lines that are relevant; number each entr 1. ⚠️ File: Vendored code detected () — missing required fields: - - - + - ``` For files where attribution markers were removed: ``` 1. ⚠️ File: Required attribution field(s) removed: - - - - + - + - ``` For files with a matching THIRD_PARTY_NOTICES.md entry: diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 59c1f79d884..eceb25f499f 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -41,6 +41,7 @@ set -euo pipefail MERGE_BASE=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main 2>/dev/null) || { echo "Error: could not determine merge-base. Neither 'origin/main' nor 'main' is reachable from HEAD." >&2 + echo "If this is a shallow clone, try: git fetch --unshallow origin main" >&2 exit 2 } NOTICES_FILE="THIRD_PARTY_NOTICES.md" @@ -254,6 +255,8 @@ detect_license_type() { # Extract the full entry content for a given ## heading from a NOTICES file. # Returns all lines between the heading and the next ## heading (or EOF). +# Note: bare "---" lines are treated as section boundaries per THIRD_PARTY_NOTICES.md +# conventions. Fenced code blocks are tracked so "---" inside ``` blocks is ignored. extract_entry() { local notices_path="$1" heading="$2" awk -v target="$heading" ' @@ -456,10 +459,15 @@ while IFS=$'\t' read -r status filepath old_filepath; do notices_entry=$(echo "$match_result" | cut -f2-) fi - # New license type check + # New license type check — uses exact whole-line matching (-xF) to avoid + # false negatives like "GPL" matching "LGPL". Tradeoff: detect_license_type + # may emit a short label (e.g. "EPL") while KNOWN_LICENSES has a versioned + # form (e.g. "EPL 2.0"), producing a false positive (new_license_type: true + # when the family is already represented). This is the safe direction — the + # LLM just emits an extra "verify compatibility" reminder. candidate_license=$(detect_license_type "$CONTENT_FILE") if [[ "$candidate_license" != "unknown" ]]; then - if ! echo "$KNOWN_LICENSES" | grep -qi "$candidate_license"; then + if ! echo "$KNOWN_LICENSES" | grep -qixF "$candidate_license"; then new_license_type="true" fi fi @@ -467,7 +475,7 @@ while IFS=$'\t' read -r status filepath old_filepath; do fi # Format reasons as comma-separated string - reasons_str=$(IFS=', '; echo "${reasons[*]}") + reasons_str=$(printf '%s, ' "${reasons[@]}" | sed 's/, $//') # Output structured block echo "---" diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh index 9dd2c3028a9..9cf7d72e6bd 100644 --- a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -477,6 +477,54 @@ JAVA assert_eq "$(get_field "$output" "new_license_type")" "true" "MPL should be flagged as new license type" } +# When KNOWN_LICENSES contains a versioned form (e.g. "EPL 2.0") but +# detect_license_type emits a shorter label (e.g. "EPL"), the exact-match +# grep -qixF won't match. This is a known false positive (safe direction): +# the script flags new_license_type: true even though the license family is +# already represented, and the LLM emits an extra "verify compatibility" +# reminder. +test_new_license_type_false_positive_versioned_heading() { + cat > THIRD_PARTY_NOTICES.md << 'NOTICES' +# Third-Party Notices + +## Example Library (Apache 2.0) + +- Source: https://github.com/example/library +- License: Apache License 2.0 +- Copyright: Copyright 2024 Example Inc. + +--- + +## EplLib (EPL 2.0) + +- Source: https://github.com/example/epllib +- License: Eclipse Public License 2.0 +- Copyright: Copyright 2024 EPL Author. +NOTICES + git add THIRD_PARTY_NOTICES.md + git commit -m "Add EPL 2.0 entry" --quiet --amend + + setup_branch + mkdir -p src + cat > src/EplVendored.java << 'JAVA' +// Adapted from AnotherEplLib. +// Copyright 2024 Another EPL Author. +// Licensed under the Eclipse Public License. +package com.example; +public class EplVendored {} +JAVA + git add src/EplVendored.java + git commit -m "Add EPL file" --quiet + + local output ok=true + output=$(run_script) || true + # detect_license_type returns "EPL" but KNOWN_LICENSES has "EPL 2.0" — + # exact match fails, so this is flagged as a new license type (false positive). + assert_eq "$(get_field "$output" "new_license_type")" "true" \ + "known false positive: 'EPL' != 'EPL 2.0' under exact match" || ok=false + [[ "$ok" == "true" ]] +} + test_notices_staged_change_detected() { setup_branch mkdir -p src @@ -837,6 +885,46 @@ NOTICES [[ "$ok" == "true" ]] } +test_spdx_license_detection() { + setup_branch + mkdir -p src + cat > src/SpdxFile.java << 'JAVA' +// SPDX-License-Identifier: MIT +// Adapted from SpdxLib. +// Copyright 2024 Spdx Author. +package com.example; +public class SpdxFile {} +JAVA + git add src/SpdxFile.java + git commit -m "Add file with SPDX identifier" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "new_license_type")" "true" \ + "MIT via SPDX should be flagged as new license type (only Apache 2.0 in NOTICES)" || ok=false + [[ "$ok" == "true" ]] +} + +test_spdx_license_matches_known() { + setup_branch + mkdir -p src + cat > src/SpdxApache.java << 'JAVA' +// SPDX-License-Identifier: Apache-2.0 +// Adapted from SpdxApacheLib. +// Copyright 2024 SpdxApache Author. +package com.example; +public class SpdxApache {} +JAVA + git add src/SpdxApache.java + git commit -m "Add file with Apache SPDX identifier" --quiet + + local output ok=true + output=$(run_script) || true + assert_eq "$(get_field "$output" "new_license_type")" "false" \ + "Apache-2.0 via SPDX should match known Apache 2.0 license type" || ok=false + [[ "$ok" == "true" ]] +} + test_source_file_has_candidate_type() { setup_branch mkdir -p src @@ -876,18 +964,21 @@ run_test "Excluded paths skipped" test_excluded_paths_skipped run_test "Generated files skipped" test_generated_files_skipped run_test "Sentry copyright not flagged" test_sentry_copyright_not_flagged run_test "New license type detected" test_new_license_type_detected +run_test "New license type — false positive on versioned heading" test_new_license_type_false_positive_versioned_heading run_test "License-only header not flagged" test_license_only_header_not_flagged run_test "Binary file skipped" test_binary_file_skipped run_test "Modified vendor-path file — attribution changed" test_modified_vendor_path_file_attribution_changed run_test "Modified vendored file — no attribution change" test_modified_vendored_file_no_attribution_change run_test "Modified first-party file — Sentry license header not flagged" test_modified_first_party_license_header_not_flagged -run_test "Unstaged modification detected" test_unstaged_modification_detected -run_test "Multiple candidates in single run" test_multiple_candidates -run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file -run_test "Progressive URL matching" test_progressive_url_matching -run_test "NOTICES entry removed" test_notices_entry_removed -run_test "NOTICES entry modified" test_notices_entry_modified -run_test "Source-file candidate has candidate_type" test_source_file_has_candidate_type +run_test "Unstaged modification detected" test_unstaged_modification_detected +run_test "Multiple candidates in single run" test_multiple_candidates +run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file +run_test "Progressive URL matching" test_progressive_url_matching +run_test "NOTICES entry removed" test_notices_entry_removed +run_test "NOTICES entry modified" test_notices_entry_modified +run_test "SPDX license detection — new type" test_spdx_license_detection +run_test "SPDX license detection — matches known" test_spdx_license_matches_known +run_test "Source-file candidate has candidate_type" test_source_file_has_candidate_type # --- Summary --- From 7f8d9bfb6cb9ddcfaf0b4c0ab11fa653f5f061be Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Tue, 12 May 2026 12:32:42 +0200 Subject: [PATCH 04/10] Fix extract_scope code block guard and bash 3.2 array expansion portability --- .../check-code-attribution/find-attribution-candidates.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index eceb25f499f..760982bb47d 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -279,7 +279,7 @@ extract_scope() { awk -v target="$heading" ' /^## / { in_target = ($0 == target); in_scope = 0; next } in_target && /^### Scope/ { in_scope = 1; next } - in_target && in_scope && /^(## |---)/ { exit } + in_target && in_scope && !in_code && /^(## |---)/ { exit } in_target && in_scope && /^```/ { in_code = !in_code; next } in_target && in_scope && !in_code { gsub(/^[[:space:]]+|[[:space:]]+$/, ""); if ($0 != "") print } ' "$notices_path" @@ -475,7 +475,7 @@ while IFS=$'\t' read -r status filepath old_filepath; do fi # Format reasons as comma-separated string - reasons_str=$(printf '%s, ' "${reasons[@]}" | sed 's/, $//') + reasons_str=$(printf '%s, ' "${reasons[@]+${reasons[@]}}" | sed 's/, $//') # Output structured block echo "---" From fdfe030ecac7393184989ae0ca5dcac1921cd445 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Tue, 12 May 2026 12:53:46 +0200 Subject: [PATCH 05/10] Fix extract_scope regex to match only bare --- separator lines --- .../check-code-attribution/find-attribution-candidates.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 760982bb47d..411f0b4b629 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -127,7 +127,7 @@ has_added_attribution_lines() { # Check if diff hunks contain removed attribution-related lines has_removed_attribution_lines() { local diff_output="$1" - grep -E '^-' <<< "$diff_output" | grep -v '^---' | grep -qiE "$ATTRIBUTION_PATTERN" + grep -E '^-' <<< "$diff_output" | grep -v '^--- ' | grep -qiE "$ATTRIBUTION_PATTERN" } # Extract URLs from a file. Capped at 50 to bound runtime on large files while still @@ -279,7 +279,7 @@ extract_scope() { awk -v target="$heading" ' /^## / { in_target = ($0 == target); in_scope = 0; next } in_target && /^### Scope/ { in_scope = 1; next } - in_target && in_scope && !in_code && /^(## |---)/ { exit } + in_target && in_scope && !in_code && /^(## |---$)/ { exit } in_target && in_scope && /^```/ { in_code = !in_code; next } in_target && in_scope && !in_code { gsub(/^[[:space:]]+|[[:space:]]+$/, ""); if ($0 != "") print } ' "$notices_path" From a77575ff3a5d219fcc795a6f7c14cad07a03830d Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Thu, 14 May 2026 14:42:53 +0200 Subject: [PATCH 06/10] Apply markushi's hybrid approach Removes the following functionality from our find-attribution-candidates script and hands it over to the LLM: 1. URL-based NOTICES matching 2. License type detection 3. "New license type" check 4. Removed/modified NOTICES entry detection --- .../skills/check-code-attribution/SKILL.md | 130 +++----- .../find-attribution-candidates.sh | 301 ++--------------- .../test/test-find-attribution-candidates.sh | 307 +----------------- 3 files changed, 68 insertions(+), 670 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index d01fd033e6c..d44eb16f394 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -8,9 +8,9 @@ allowed-tools: Bash, Read, Grep, Glob Verify that vendored/adapted third-party code in the current branch diff has correct license headers and `THIRD_PARTY_NOTICES.md` entries. -## Step 1: Run the Analysis Script +## Step 1: Run the Candidate Detection Script -Run the pre-filter and analysis script: +Run the pre-filter script: ```bash bash .claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -18,111 +18,64 @@ bash .claude/skills/check-code-attribution/find-attribution-candidates.sh If the script exits with code 0, there are no candidates. Print "No attribution issues found." -If the script exits with any other code besides 0 or 10, it failed (e.g., could not determine merge-base). Print the stderr output and **stop** — do not attempt to interpret partial output. +If the script exits with any code besides 0 or 10, it failed (e.g., could not determine merge-base). Print the stderr output and **stop**. -If the script exits with code 10, it outputs structured blocks for two types of candidates: +If the script exits with code 10, it outputs: -**Source-file candidates** (changed files with attribution markers): +1. **Global metadata** (first two lines): + ``` + notices_file_exists: true|false + notices_file_changed: true|false + ``` -``` ---- -candidate_type: source_file -file: -status: A|M|D|R -reasons: -notices_match: url|none|deleted -notices_entry: -new_license_type: true|false -notices_file_exists: true|false -notices_file_changed: true|false ---- -``` - -**NOTICES-entry candidates** (entries removed or modified in `THIRD_PARTY_NOTICES.md`): - -``` ---- -candidate_type: notices_entry -notices_entry: -notices_change: removed|modified -scope_text_start: - -scope_text_end: -notices_file_exists: true|false -notices_file_changed: true|false ---- -``` +2. **Candidate blocks** (one per flagged file): + ``` + --- + file: + status: A|M|D|R + reasons: + --- + ``` -The script deterministically handles: -- Candidate identification and filtering (attribution markers, third-party copyright/license headers, vendor-like path segments) -- Rename detection (`--find-renames`) -- Removed attribution detection (catches accidental stripping of attribution headers) -- URL-based matching to `THIRD_PARTY_NOTICES.md` entries -- New license type detection -- Deleted file identification -- Detection of whether `THIRD_PARTY_NOTICES.md` exists in the repo -- Tracking whether `THIRD_PARTY_NOTICES.md` itself was modified in the diff -- Detection of removed `THIRD_PARTY_NOTICES.md` entries (headings present in old but absent in new version) -- Detection of modified `THIRD_PARTY_NOTICES.md` entries (entry content changed between old and new version) -- Scope text extraction for removed/modified entries +The script handles candidate identification deterministically: changed-file collection across committed/staged/unstaged layers, path and generated-file exclusions, binary detection, vendor-path detection, attribution-marker detection, and first-party copyright filtering. **Important:** The script checks three layers of changes: (1) committed changes on the branch vs. the merge-base, (2) staged but uncommitted changes, and (3) unstaged working-tree changes. A candidate may not appear in `git diff merge-base..HEAD` if it is only staged or only in the working tree. Do NOT dismiss a candidate as a false positive just because it is absent from the committed branch diff — check `git status`, `git diff --cached`, and `git diff` (unstaged) to see the full picture. Parse the output and proceed to Step 2. -## Step 2: Check Attribution Format - -**Skip this step for deleted files** — there is nothing to fix in a file that no longer exists. - -**Skip this step for `candidate_type: notices_entry` candidates** — these are cross-referenced in Step 3. +## Step 2: Gather Context Read `CODE_ATTRIBUTION_CRITERIA.md` (in this skill's directory) for the canonical attribution format. -For each actionable non-deleted candidate, read its file content and check for the required fields from `CODE_ATTRIBUTION_CRITERIA.md`. - -Extra information beyond the required fields is fine, as are differences in formatting. Only flag **missing** fields. - -Record which fields are missing for each file. +If `notices_file_exists` is `true`, read `THIRD_PARTY_NOTICES.md` to understand existing entries. -## Step 3: Interpret Script Results +## Step 3: Analyze Each Candidate -For each candidate, determine one **primary finding** and zero or more **additional warnings**. +**Skip analysis for deleted files** (`status: D`) — go straight to the finding: "Deleted vendored file — verify `THIRD_PARTY_NOTICES.md` is still accurate." -### NOTICES-entry findings (for `candidate_type: notices_entry`) +For each non-deleted candidate: -For candidates with `candidate_type: notices_entry`, determine one finding per entry: +1. **Read the file** and check for the required attribution fields from `CODE_ATTRIBUTION_CRITERIA.md`. Extra information beyond the required fields is fine, as are differences in formatting. Only flag **missing** fields. -1. **Entry removed** (`notices_change: removed`) — Read the `scope_text` to identify the source files/packages referenced by this entry. Check whether the referenced files still exist in the repo (use `Glob` or `Read`). If they still exist and contain third-party attribution headers referencing the removed library, flag: the NOTICES entry was removed but vendored source files still reference this library — either restore the entry or remove the vendored code. If the referenced files were also deleted in this branch (they would appear as separate `candidate_type: source_file` candidates with `status: D`), this is consistent — note it as informational only. +2. **Match to a `THIRD_PARTY_NOTICES.md` entry** — Try to find a corresponding entry by URL, library name, copyright holder, or other context. Record whether you found a match, and if so, which entry. -2. **Entry modified** (`notices_change: modified`) — The entry content was changed. Read the current `THIRD_PARTY_NOTICES.md` entry and the source files referenced in the scope text. Compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag any inconsistencies (e.g., copyright year mismatch, license mismatch). If the entry is consistent with the source files, note it as informational only. +3. **Check the license type** — Identify the license declared in the file's header. Check whether this license type is already represented in `THIRD_PARTY_NOTICES.md` headings. If it's a new license type not yet in NOTICES, flag it for compatibility review. -### Source-file primary finding (one per file — use the first that matches) +## Step 4: Check for NOTICES Entry Changes -Work through this list top-to-bottom and stop at the first match: +If `notices_file_changed` is `true`, compare old vs. new `THIRD_PARTY_NOTICES.md`: -1. **Deleted** (`status: D`) — report: "Deleted vendored file — verify `THIRD_PARTY_NOTICES.md` is still accurate." - -2. **Attribution removed** (`reasons` contains "attribution markers removed" but not "modified") — flag that attribution markers were removed and should be restored. Show the removed lines. - -3. **Attribution modified** (`reasons` contains "attribution markers modified") — remind the user to verify the `THIRD_PARTY_NOTICES.md` entry still matches the updated header. - -4. **Renamed** (`status: R`) — flag that the `THIRD_PARTY_NOTICES.md` Scope section likely needs updating to reflect the new path. - -5. **No NOTICES match** (`notices_match: none` and `notices_file_exists: true`) — the script could not URL-match this file to a `THIRD_PARTY_NOTICES.md` entry. Read the file's attribution header and `THIRD_PARTY_NOTICES.md`, then attempt a **fuzzy match** by library name, copyright holder, or other context. If you find a match, treat it as matched (use finding 6 instead). If not, flag that the file needs an entry added. - -6. **URL match** (`notices_match: url`) — the script found a URL match. Remind the user to verify that the matched entry's Scope section still accurately describes the vendored code. - -7. **No NOTICES file** (`notices_file_exists: false`) — `THIRD_PARTY_NOTICES.md` does not exist. Flag that it needs to be created and populated with an entry for this vendored library. - -### Additional warnings (zero or more per file, independent of each other) +```bash +git show $(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main):THIRD_PARTY_NOTICES.md +``` -After determining the primary finding, check each of these independently. Skip all additional warnings for files whose primary finding is **Deleted** or **Attribution removed**. +Check for: -- **New license type** (`new_license_type: true`) — "- ❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/." (Put this comment in its own bullet point.) +1. **Removed entries** — Headings present in the old version but absent in the new. For each, check whether the source files referenced in the entry's Scope section still exist in the repo (use `Glob` or `Read`). If they still exist and contain third-party attribution headers, flag: the NOTICES entry was removed but vendored source files still reference this library. If the referenced files were also deleted in this branch, note it as informational only. -- **NOTICES file not updated** (`notices_file_changed: false` and `notices_file_exists: true`) — if vendored code was added, modified, or renamed but `THIRD_PARTY_NOTICES.md` was not touched, remind that it may need updating. +2. **Modified entries** — Headings present in both but with changed content. Read the source files referenced in the entry's Scope section and compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag any inconsistencies. If the entry is consistent with the source files, note it as informational only. -## Step 4: Output Results +## Step 5: Output Results ### No issues found @@ -130,9 +83,9 @@ Print "No attribution issues found." ### Issues found -Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️ — these represent invalid or missing attribution that must be fixed before merging. Informational reminders (verify, check) should have a 👀 prefix. +Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️. Informational reminders (verify, check) get a 👀 prefix. -Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; don't add any sections besides Action Items and/or False Positives; omit Action Items or False Positives sections if none found; always use the bullet point (`-`) when including a line that has a bullet point below, and put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): +Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; omit Action Items or False Positives sections if none found; put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): ``` ********************************************************************************** @@ -151,8 +104,8 @@ For files where attribution markers were removed: ``` 1. ⚠️ File: Required attribution field(s) removed: - - - - + - + - ``` For files with a matching THIRD_PARTY_NOTICES.md entry: @@ -193,6 +146,11 @@ For modified NOTICES entries that are consistent: Verify updated entry is consistent with source file headers. ``` +For new license types: +``` + - ❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/. +``` + ``` ---------------------------------- False positives ------------------------------- diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 411f0b4b629..7e96a4ae186 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -2,37 +2,19 @@ # shellcheck shell=bash # # Script for finding files in the current branch diff that may have added, modified, or removed an -# attribution for vendored code (those files are referred to as "attribution candidates"). When used -# with the check-code-attribution skill, this script lets us avoid sending work to the LLM that we -# can do ourselves. +# attribution for vendored code (those files are referred to as "attribution candidates"). This +# script handles cheap, deterministic identification and filtering; all interpretation (cross- +# referencing with THIRD_PARTY_NOTICES.md, license classification, etc.) is left to the LLM. # -# There are two types of candidates: -# -# 1. Source-file candidates (changed files with attribution markers): +# When candidates are found, the output starts with global metadata followed by one block per +# candidate: # +# notices_file_exists: true|false +# notices_file_changed: true|false # --- -# candidate_type: source_file # file: # status: A|M|D|R (A = "added", M = "modified", D = "deleted", R = "renamed") # reasons: -# notices_match: url|none|deleted -# notices_entry: -# new_license_type: true|false -# notices_file_exists: true|false -# notices_file_changed: true|false -# --- -# -# 2. NOTICES-entry candidates (entries removed or modified in THIRD_PARTY_NOTICES.md): -# -# --- -# candidate_type: notices_entry -# notices_entry: -# notices_change: removed|modified -# scope_text_start: -# -# scope_text_end: -# notices_file_exists: true|false -# notices_file_changed: true|false # --- # # Exit code 0 = no candidates found, exit code 10 = candidates found. @@ -70,6 +52,16 @@ SENTRY_ENTITIES='functional software|getsentry|sentry software' # Path segments that suggest vendored/external code VENDOR_PATH_MARKERS='external|shaded|third-party|third_party|thirdparty|vendor|vendored' +# --- Global metadata (emitted once before any candidates) --- +global_metadata_emitted=false +emit_global_metadata() { + if [[ "$global_metadata_emitted" == "false" ]]; then + echo "notices_file_exists: $notices_file_exists" + echo "notices_file_changed: $notices_file_changed" + global_metadata_emitted=true + fi +} + is_binary_file() { # -I treats binary files as non-matching → exit 1; empty pattern matches any text file → exit 0 ! grep -Iq '' "$1" 2>/dev/null @@ -85,8 +77,7 @@ is_excluded_path() { # changes carefully). Using a file avoids re-creating a process substitution per candidate. EXCLUSION_PATTERNS_FILE=$(mktemp) CONTENT_FILE=$(mktemp) -OLD_NOTICES="" -trap 'rm -f "$EXCLUSION_PATTERNS_FILE" "$CONTENT_FILE" ${OLD_NOTICES:+"$OLD_NOTICES"}' EXIT +trap 'rm -f "$EXCLUSION_PATTERNS_FILE" "$CONTENT_FILE"' EXIT if [[ -f "$EXCLUSIONS_FILE" ]]; then grep -v '^#' "$EXCLUSIONS_FILE" | grep -v '^$' > "$EXCLUSION_PATTERNS_FILE" || true fi @@ -130,160 +121,6 @@ has_removed_attribution_lines() { grep -E '^-' <<< "$diff_output" | grep -v '^--- ' | grep -qiE "$ATTRIBUTION_PATTERN" } -# Extract URLs from a file. Capped at 50 to bound runtime on large files while still -# catching source URLs that appear after import blocks or lengthy preambles. -extract_urls() { - { grep -oE 'https?://[^ )"<>]+' "$1" || true; } | head -50 -} - -# Try to match a candidate file to a THIRD_PARTY_NOTICES.md entry by URL. -# Collects all URLs, normalizes them, builds progressively shorter prefixes, -# and runs a single awk pass against THIRD_PARTY_NOTICES.md. -# Prints: "url" if matched, "none" otherwise. -match_by_url() { - local filepath="$1" - local urls - urls=$(extract_urls "$filepath") - - [[ -z "$urls" ]] && { echo "none"; return; } - - local normalized_urls - normalized_urls=$(while IFS= read -r url; do - echo "$url" | sed -E 's|https?://||' | sed 's|^www\.||' | sed 's|[[:space:]]*$||' | sed 's|/$||' - done <<< "$urls") - - [[ -z "$normalized_urls" ]] && { echo "none"; return; } - - local result - result=$(awk ' - function count_slashes(s, i, c) { - c = 0 - for (i = 1; i <= length(s); i++) - if (substr(s, i, 1) == "/") c++ - return c - } - NR == FNR { - u = tolower($0) - while (index(u, "/") > 0) { - # Require at least 2 slashes (e.g., github.com/org/repo) to avoid - # matching at the domain or org level. - if (count_slashes(u) >= 2) - prefixes[n++] = u - sub(/\/[^\/]*$/, "", u) - } - next - } - /^## / { current_heading = $0 } - { - line = tolower($0) - for (i = 0; i < n; i++) { - if (prefixes[i] != "" && index(line, prefixes[i]) > 0 && current_heading != "") { - plen = length(prefixes[i]) - if (plen > best_len) { - best_len = plen - best_heading = current_heading - } - } - } - } - END { - if (best_len > 0) - print "url\t" best_heading - } - ' <(echo "$normalized_urls") "$NOTICES_FILE" 2>/dev/null) - - if [[ -n "$result" ]]; then - echo "$result" - else - echo "none" - fi -} - -# Extract known license types from THIRD_PARTY_NOTICES.md headings -extract_known_license_types() { - grep '^## ' "$NOTICES_FILE" 2>/dev/null | grep -oE '\([^)]+\)' | tr -d '()' | sort -u -} - -# Detect the license type declared in a file's content -detect_license_type() { - local filepath="$1" - - # SPDX identifiers are the most precise — check them first - local spdx_id - spdx_id=$(grep -oiE 'SPDX-License-Identifier:[[:space:]]*[^ ]+' "$filepath" | head -1 | sed 's/.*:[[:space:]]*//') - if [[ -n "$spdx_id" ]]; then - local spdx_lower - spdx_lower=$(echo "$spdx_id" | tr '[:upper:]' '[:lower:]') - case "$spdx_lower" in - apache-2.0) echo "Apache 2.0"; return ;; - mit) echo "MIT"; return ;; - bsd-2-clause|bsd-3-clause) echo "BSD"; return ;; - lgpl-*) echo "LGPL"; return ;; - gpl-*) echo "GPL"; return ;; - mpl-*) echo "MPL"; return ;; - epl-*) echo "EPL"; return ;; - isc) echo "ISC"; return ;; - unlicense|cc0-*) echo "Public Domain"; return ;; - cc-by-*) echo "CC-BY"; return ;; - esac - fi - - if grep -qiE 'Apache License|Apache 2\.0' "$filepath"; then - echo "Apache 2.0" - elif grep -qiE 'MIT License' "$filepath"; then - echo "MIT" - elif grep -qiE 'BSD License|BSD [0-9]' "$filepath"; then - echo "BSD" - elif grep -qiE 'LGPL' "$filepath"; then - echo "LGPL" - elif grep -qiE 'GPL|GNU General Public' "$filepath"; then - echo "GPL" - elif grep -qiE 'Mozilla Public' "$filepath"; then - echo "MPL" - elif grep -qiE 'Eclipse Public License|EPL' "$filepath"; then - echo "EPL" - elif grep -qiE 'ISC License' "$filepath"; then - echo "ISC" - elif grep -qiE 'Unlicense|Public Domain' "$filepath"; then - echo "Public Domain" - elif grep -qiE 'Creative Commons|CC-BY' "$filepath"; then - echo "CC-BY" - else - echo "unknown" - fi -} - -# Extract the full entry content for a given ## heading from a NOTICES file. -# Returns all lines between the heading and the next ## heading (or EOF). -# Note: bare "---" lines are treated as section boundaries per THIRD_PARTY_NOTICES.md -# conventions. Fenced code blocks are tracked so "---" inside ``` blocks is ignored. -extract_entry() { - local notices_path="$1" heading="$2" - awk -v target="$heading" ' - /^## / { - if (in_target) exit - in_target = ($0 == target) - next - } - in_target && /^```/ { in_code = !in_code } - in_target && !in_code && /^---$/ { exit } - in_target { print } - ' "$notices_path" -} - -# Extract the ### Scope section text for a given ## heading from a NOTICES file. -# Returns the descriptive paragraph(s) between "### Scope" and the next section -# boundary (---, ##, or EOF), excluding fenced code blocks. -extract_scope() { - local notices_path="$1" heading="$2" - awk -v target="$heading" ' - /^## / { in_target = ($0 == target); in_scope = 0; next } - in_target && /^### Scope/ { in_scope = 1; next } - in_target && in_scope && !in_code && /^(## |---$)/ { exit } - in_target && in_scope && /^```/ { in_code = !in_code; next } - in_target && in_scope && !in_code { gsub(/^[[:space:]]+|[[:space:]]+$/, ""); if ($0 != "") print } - ' "$notices_path" -} # Collect changed files from all sources (committed, staged, unstaged, untracked), # deduplicated by current filepath (first occurrence wins). The committed diff is listed @@ -322,12 +159,6 @@ if git diff "$MERGE_BASE"..HEAD --name-only -- "$NOTICES_FILE" 2>/dev/null | gre notices_file_changed="true" fi -# Load known license types once (skip if NOTICES file doesn't exist) -KNOWN_LICENSES="" -if [[ "$notices_file_exists" == "true" ]]; then - KNOWN_LICENSES=$(extract_known_license_types) -fi - found_any=false # Process each changed file @@ -442,113 +273,23 @@ while IFS=$'\t' read -r status filepath old_filepath; do continue fi - # --- This file is an actionable candidate. Run deterministic checks. --- - - # THIRD_PARTY_NOTICES.md URL matching (skip for deleted files) - notices_match="none" - notices_entry="" - new_license_type="false" - - if [[ "$status_char" == "D" ]]; then - notices_match="deleted" - else - if [[ "$notices_file_exists" == "true" ]]; then - match_result=$(match_by_url "$CONTENT_FILE") - if [[ "$match_result" != "none" ]]; then - notices_match=$(echo "$match_result" | cut -f1) - notices_entry=$(echo "$match_result" | cut -f2-) - fi - - # New license type check — uses exact whole-line matching (-xF) to avoid - # false negatives like "GPL" matching "LGPL". Tradeoff: detect_license_type - # may emit a short label (e.g. "EPL") while KNOWN_LICENSES has a versioned - # form (e.g. "EPL 2.0"), producing a false positive (new_license_type: true - # when the family is already represented). This is the safe direction — the - # LLM just emits an extra "verify compatibility" reminder. - candidate_license=$(detect_license_type "$CONTENT_FILE") - if [[ "$candidate_license" != "unknown" ]]; then - if ! echo "$KNOWN_LICENSES" | grep -qixF "$candidate_license"; then - new_license_type="true" - fi - fi - fi - fi - # Format reasons as comma-separated string reasons_str=$(printf '%s, ' "${reasons[@]+${reasons[@]}}" | sed 's/, $//') + # Emit global metadata once before the first candidate + emit_global_metadata + # Output structured block echo "---" - echo "candidate_type: source_file" echo "file: $filepath" echo "status: $status_char" echo "reasons: $reasons_str" - echo "notices_match: $notices_match" - echo "notices_entry: $notices_entry" - echo "new_license_type: $new_license_type" - echo "notices_file_exists: $notices_file_exists" - echo "notices_file_changed: $notices_file_changed" echo "---" found_any=true done < <(collect_changed_files) -# --- NOTICES-entry analysis (detect removed or modified entries) --- -# When THIRD_PARTY_NOTICES.md was changed, diff the old and new entry headings to find -# entries that were removed or whose content was modified. This catches the reverse -# direction: source files are unchanged but their NOTICES entry was altered. -if [[ "$notices_file_changed" == "true" ]]; then - OLD_NOTICES=$(mktemp) - git show "$MERGE_BASE":"$NOTICES_FILE" > "$OLD_NOTICES" 2>/dev/null || true - - if [[ -s "$OLD_NOTICES" ]]; then - old_headings=$({ grep '^## ' "$OLD_NOTICES" || true; } | sort) - new_headings="" - if [[ -f "$NOTICES_FILE" ]]; then - new_headings=$({ grep '^## ' "$NOTICES_FILE" || true; } | sort) - fi - - # Removed headings: in old but not in new - while IFS= read -r heading; do - [[ -z "$heading" ]] && continue - scope_text=$(extract_scope "$OLD_NOTICES" "$heading") - echo "---" - echo "candidate_type: notices_entry" - echo "notices_entry: $heading" - echo "notices_change: removed" - echo "scope_text_start:" - [[ -n "$scope_text" ]] && echo "$scope_text" - echo "scope_text_end:" - echo "notices_file_exists: $notices_file_exists" - echo "notices_file_changed: $notices_file_changed" - echo "---" - found_any=true - done < <(comm -23 <(echo "$old_headings") <(echo "$new_headings")) - - # Modified entries: heading present in both, but entry content changed - while IFS= read -r heading; do - [[ -z "$heading" ]] && continue - old_entry=$(extract_entry "$OLD_NOTICES" "$heading") - new_entry=$(extract_entry "$NOTICES_FILE" "$heading") - if [[ "$old_entry" != "$new_entry" ]]; then - scope_text=$(extract_scope "$OLD_NOTICES" "$heading") - echo "---" - echo "candidate_type: notices_entry" - echo "notices_entry: $heading" - echo "notices_change: modified" - echo "scope_text_start:" - [[ -n "$scope_text" ]] && echo "$scope_text" - echo "scope_text_end:" - echo "notices_file_exists: $notices_file_exists" - echo "notices_file_changed: $notices_file_changed" - echo "---" - found_any=true - fi - done < <(comm -12 <(echo "$old_headings") <(echo "$new_headings")) - fi -fi - if [[ "$found_any" == "true" ]]; then exit 10 fi diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh index 9cf7d72e6bd..d059b1ea750 100644 --- a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -85,11 +85,6 @@ assert_contains() { return 1 } -get_scope_text() { - local output="$1" - echo "$output" | sed -n '/^scope_text_start:$/,/^scope_text_end:$/{ /^scope_text_start:$/d; /^scope_text_end:$/d; p; }' -} - run_test() { local test_name="$1" test_fn="$2" TESTS_RUN=$((TESTS_RUN + 1)) @@ -191,7 +186,6 @@ JAVA assert_eq "$(get_field "$output" "file")" "src/Licensed.java" "file path" || ok=false assert_eq "$(get_field "$output" "status")" "D" "status" || ok=false assert_contains "$(get_field "$output" "reasons")" "deleted file with attribution markers" "reasons" || ok=false - assert_eq "$(get_field "$output" "notices_match")" "deleted" "notices_match" || ok=false [[ "$ok" == "true" ]] } @@ -363,49 +357,9 @@ JAVA local output output=$(run_script) || true - assert_eq "$(get_field "$output" "notices_file_changed")" "true" "notices_file_changed" + assert_contains "$output" "notices_file_changed: true" "global notices_file_changed" } -test_notices_url_matching() { - setup_branch - mkdir -p src - cat > src/Matched.java << 'JAVA' -// Adapted from Example Library. -// Copyright 2024 Example Inc. -// Licensed under the Apache License 2.0. -// https://github.com/example/library -package com.example; -public class Matched {} -JAVA - git add src/Matched.java - git commit -m "Add file matching notices URL" --quiet - - local output ok=true - output=$(run_script) || true - assert_eq "$(get_field "$output" "notices_match")" "url" "notices_match" || ok=false - assert_contains "$(get_field "$output" "notices_entry")" "Example Library" "notices_entry" || ok=false - [[ "$ok" == "true" ]] -} - -test_mit_no_false_positive_from_commit() { - setup_branch - mkdir -p src - cat > src/CommitHelper.java << 'JAVA' -// Adapted from CommitLib. -// Copyright 2024 Commit Author. -package com.example; -public class CommitHelper { - // This COMMIT message should not trigger MIT detection - void commit() {} -} -JAVA - git add src/CommitHelper.java - git commit -m "Add file with COMMIT word" --quiet - - local output - output=$(run_script) || true - assert_eq "$(get_field "$output" "new_license_type")" "false" "COMMIT should not trigger MIT detection" -} test_excluded_paths_skipped() { setup_branch @@ -459,71 +413,6 @@ JAVA assert_eq "$exit_code" "0" "Sentry copyright should not trigger attribution" } -test_new_license_type_detected() { - setup_branch - mkdir -p src - cat > src/MplFile.java << 'JAVA' -// Adapted from MplLib. -// Copyright 2024 Mpl Author. -// Licensed under the Mozilla Public License. -package com.example; -public class MplFile {} -JAVA - git add src/MplFile.java - git commit -m "Add MPL file" --quiet - - local output - output=$(run_script) || true - assert_eq "$(get_field "$output" "new_license_type")" "true" "MPL should be flagged as new license type" -} - -# When KNOWN_LICENSES contains a versioned form (e.g. "EPL 2.0") but -# detect_license_type emits a shorter label (e.g. "EPL"), the exact-match -# grep -qixF won't match. This is a known false positive (safe direction): -# the script flags new_license_type: true even though the license family is -# already represented, and the LLM emits an extra "verify compatibility" -# reminder. -test_new_license_type_false_positive_versioned_heading() { - cat > THIRD_PARTY_NOTICES.md << 'NOTICES' -# Third-Party Notices - -## Example Library (Apache 2.0) - -- Source: https://github.com/example/library -- License: Apache License 2.0 -- Copyright: Copyright 2024 Example Inc. - ---- - -## EplLib (EPL 2.0) - -- Source: https://github.com/example/epllib -- License: Eclipse Public License 2.0 -- Copyright: Copyright 2024 EPL Author. -NOTICES - git add THIRD_PARTY_NOTICES.md - git commit -m "Add EPL 2.0 entry" --quiet --amend - - setup_branch - mkdir -p src - cat > src/EplVendored.java << 'JAVA' -// Adapted from AnotherEplLib. -// Copyright 2024 Another EPL Author. -// Licensed under the Eclipse Public License. -package com.example; -public class EplVendored {} -JAVA - git add src/EplVendored.java - git commit -m "Add EPL file" --quiet - - local output ok=true - output=$(run_script) || true - # detect_license_type returns "EPL" but KNOWN_LICENSES has "EPL 2.0" — - # exact match fails, so this is flagged as a new license type (false positive). - assert_eq "$(get_field "$output" "new_license_type")" "true" \ - "known false positive: 'EPL' != 'EPL 2.0' under exact match" || ok=false - [[ "$ok" == "true" ]] -} test_notices_staged_change_detected() { setup_branch @@ -545,7 +434,7 @@ JAVA local output output=$(run_script) || true - assert_eq "$(get_field "$output" "notices_file_changed")" "true" "staged notices change should be detected" + assert_contains "$output" "notices_file_changed: true" "staged notices change should be detected" } test_modified_vendor_path_file_attribution_changed() { @@ -755,194 +644,14 @@ JAVA local output exit_code=0 ok=true output=$(run_script) || exit_code=$? assert_eq "$exit_code" "10" "should still find candidates" || ok=false - assert_eq "$(get_field "$output" "notices_file_exists")" "false" "notices_file_exists" || ok=false - assert_eq "$(get_field "$output" "notices_match")" "none" "notices_match" || ok=false - assert_eq "$(get_field "$output" "new_license_type")" "false" "new_license_type should be false when no notices file" || ok=false - [[ "$ok" == "true" ]] -} - -test_progressive_url_matching() { - setup_branch - mkdir -p src - cat > src/DeepUrl.java << 'JAVA' -// Adapted from Example Library. -// Copyright 2024 Example Inc. -// Licensed under the Apache License 2.0. -// https://github.com/example/library/tree/main/src/Foo.java -package com.example; -public class DeepUrl {} -JAVA - git add src/DeepUrl.java - git commit -m "Add file with deep URL" --quiet - - local output ok=true - output=$(run_script) || true - assert_eq "$(get_field "$output" "notices_match")" "url" "deep URL should match via progressive trimming" || ok=false - assert_contains "$(get_field "$output" "notices_entry")" "Example Library" "notices_entry" || ok=false - [[ "$ok" == "true" ]] -} - -test_notices_entry_removed() { - cat > THIRD_PARTY_NOTICES.md << 'NOTICES' -# Third-Party Notices - -## Example Library (Apache 2.0) - -**Source:** https://github.com/example/library -**License:** Apache License 2.0 -**Copyright:** Copyright 2024 Example Inc. - -### Scope - -The code resides in `com.example.Foo`. - ---- - -## Other Library (MIT) - -**Source:** https://github.com/other/library -**License:** MIT License -**Copyright:** Copyright 2024 Other Inc. - -### Scope - -The code resides in `com.other.Bar`. -NOTICES - git add THIRD_PARTY_NOTICES.md - git commit -m "Two NOTICES entries" --quiet --amend - - setup_branch - - cat > THIRD_PARTY_NOTICES.md << 'NOTICES' -# Third-Party Notices - -## Example Library (Apache 2.0) - -**Source:** https://github.com/example/library -**License:** Apache License 2.0 -**Copyright:** Copyright 2024 Example Inc. - -### Scope - -The code resides in `com.example.Foo`. -NOTICES - git add THIRD_PARTY_NOTICES.md - git commit -m "Remove Other Library entry" --quiet - - local output exit_code=0 ok=true - output=$(run_script) || exit_code=$? - assert_eq "$exit_code" "10" "should exit 10 when NOTICES entry removed" || ok=false - assert_contains "$output" "candidate_type: notices_entry" "should have notices_entry candidate" || ok=false - assert_contains "$output" "notices_change: removed" "should flag as removed" || ok=false - assert_contains "$output" "## Other Library (MIT)" "should identify removed entry" || ok=false - local scope - scope=$(get_scope_text "$output") - assert_contains "$scope" "com.other.Bar" "scope should reference source files" || ok=false + assert_contains "$output" "notices_file_exists: false" "global notices_file_exists" || ok=false [[ "$ok" == "true" ]] } -test_notices_entry_modified() { - cat > THIRD_PARTY_NOTICES.md << 'NOTICES' -# Third-Party Notices - -## Example Library (Apache 2.0) - -**Source:** https://github.com/example/library -**License:** Apache License 2.0 -**Copyright:** Copyright 2024 Example Inc. - -### Scope - -The code resides in `com.example.Foo`. -NOTICES - git add THIRD_PARTY_NOTICES.md - git commit -m "NOTICES with entry" --quiet --amend - setup_branch - cat > THIRD_PARTY_NOTICES.md << 'NOTICES' -# Third-Party Notices -## Example Library (Apache 2.0) -**Source:** https://github.com/example/library -**License:** Apache License 2.0 -**Copyright:** Copyright 2025 Example Inc. - -### Scope - -The code resides in `com.example.Foo`. -NOTICES - git add THIRD_PARTY_NOTICES.md - git commit -m "Change copyright year in NOTICES" --quiet - - local output exit_code=0 ok=true - output=$(run_script) || exit_code=$? - assert_eq "$exit_code" "10" "should exit 10 when NOTICES entry modified" || ok=false - assert_contains "$output" "candidate_type: notices_entry" "should have notices_entry candidate" || ok=false - assert_contains "$output" "notices_change: modified" "should flag as modified" || ok=false - assert_contains "$output" "## Example Library (Apache 2.0)" "should identify modified entry" || ok=false - [[ "$ok" == "true" ]] -} - -test_spdx_license_detection() { - setup_branch - mkdir -p src - cat > src/SpdxFile.java << 'JAVA' -// SPDX-License-Identifier: MIT -// Adapted from SpdxLib. -// Copyright 2024 Spdx Author. -package com.example; -public class SpdxFile {} -JAVA - git add src/SpdxFile.java - git commit -m "Add file with SPDX identifier" --quiet - - local output ok=true - output=$(run_script) || true - assert_eq "$(get_field "$output" "new_license_type")" "true" \ - "MIT via SPDX should be flagged as new license type (only Apache 2.0 in NOTICES)" || ok=false - [[ "$ok" == "true" ]] -} - -test_spdx_license_matches_known() { - setup_branch - mkdir -p src - cat > src/SpdxApache.java << 'JAVA' -// SPDX-License-Identifier: Apache-2.0 -// Adapted from SpdxApacheLib. -// Copyright 2024 SpdxApache Author. -package com.example; -public class SpdxApache {} -JAVA - git add src/SpdxApache.java - git commit -m "Add file with Apache SPDX identifier" --quiet - - local output ok=true - output=$(run_script) || true - assert_eq "$(get_field "$output" "new_license_type")" "false" \ - "Apache-2.0 via SPDX should match known Apache 2.0 license type" || ok=false - [[ "$ok" == "true" ]] -} - -test_source_file_has_candidate_type() { - setup_branch - mkdir -p src - cat > src/Typed.java << 'JAVA' -// Adapted from TypedLib. -// Copyright 2024 Typed Author. -// Licensed under the Apache License 2.0. -package com.example; -public class Typed {} -JAVA - git add src/Typed.java - git commit -m "Add typed file" --quiet - - local output ok=true - output=$(run_script) || true - assert_contains "$output" "candidate_type: source_file" "should include candidate_type field" || ok=false - [[ "$ok" == "true" ]] -} # --- Run all tests --- @@ -958,13 +667,9 @@ run_test "Staged modification detected" test_staged_modification_de run_test "Untracked file detected" test_untracked_file_detected run_test "THIRD_PARTY_NOTICES.md change — committed" test_notices_file_changed_true run_test "THIRD_PARTY_NOTICES.md change — staged" test_notices_staged_change_detected -run_test "URL matching to notices entries" test_notices_url_matching -run_test "MIT — no false positive from COMMIT" test_mit_no_false_positive_from_commit run_test "Excluded paths skipped" test_excluded_paths_skipped run_test "Generated files skipped" test_generated_files_skipped run_test "Sentry copyright not flagged" test_sentry_copyright_not_flagged -run_test "New license type detected" test_new_license_type_detected -run_test "New license type — false positive on versioned heading" test_new_license_type_false_positive_versioned_heading run_test "License-only header not flagged" test_license_only_header_not_flagged run_test "Binary file skipped" test_binary_file_skipped run_test "Modified vendor-path file — attribution changed" test_modified_vendor_path_file_attribution_changed @@ -973,12 +678,6 @@ run_test "Modified first-party file — Sentry license header not flagged" test_ run_test "Unstaged modification detected" test_unstaged_modification_detected run_test "Multiple candidates in single run" test_multiple_candidates run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file -run_test "Progressive URL matching" test_progressive_url_matching -run_test "NOTICES entry removed" test_notices_entry_removed -run_test "NOTICES entry modified" test_notices_entry_modified -run_test "SPDX license detection — new type" test_spdx_license_detection -run_test "SPDX license detection — matches known" test_spdx_license_matches_known -run_test "Source-file candidate has candidate_type" test_source_file_has_candidate_type # --- Summary --- From e373005fcf2667c2fc7b0b8d1f0acb02b478e369 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Thu, 14 May 2026 14:50:18 +0200 Subject: [PATCH 07/10] Loosen LLM output directives --- .../skills/check-code-attribution/SKILL.md | 116 +++++------------- .../find-attribution-candidates.sh | 1 - .../test/test-find-attribution-candidates.sh | 4 - 3 files changed, 31 insertions(+), 90 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index d44eb16f394..9e7708d455f 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -16,7 +16,7 @@ Run the pre-filter script: bash .claude/skills/check-code-attribution/find-attribution-candidates.sh ``` -If the script exits with code 0, there are no candidates. Print "No attribution issues found." +If the script exits with code 0, there are no candidates. Print "✅ No attribution issues found." If the script exits with any code besides 0 or 10, it failed (e.g., could not determine merge-base). Print the stderr output and **stop**. @@ -32,14 +32,12 @@ If the script exits with code 10, it outputs: ``` --- file: - status: A|M|D|R + status: A|M|D|R (i.e., "added", "modified", "deleted", "renamed") reasons: --- ``` -The script handles candidate identification deterministically: changed-file collection across committed/staged/unstaged layers, path and generated-file exclusions, binary detection, vendor-path detection, attribution-marker detection, and first-party copyright filtering. - -**Important:** The script checks three layers of changes: (1) committed changes on the branch vs. the merge-base, (2) staged but uncommitted changes, and (3) unstaged working-tree changes. A candidate may not appear in `git diff merge-base..HEAD` if it is only staged or only in the working tree. Do NOT dismiss a candidate as a false positive just because it is absent from the committed branch diff — check `git status`, `git diff --cached`, and `git diff` (unstaged) to see the full picture. +The script handles candidate identification deterministically — including committed, staged, and unstaged changes — so trust its output. Do not dismiss a candidate as a false positive based on the committed diff alone. Parse the output and proceed to Step 2. @@ -51,7 +49,7 @@ If `notices_file_exists` is `true`, read `THIRD_PARTY_NOTICES.md` to understand ## Step 3: Analyze Each Candidate -**Skip analysis for deleted files** (`status: D`) — go straight to the finding: "Deleted vendored file — verify `THIRD_PARTY_NOTICES.md` is still accurate." +**Skip analysis for deleted files** (`status: D`) — they only need a 👀 verify finding. For each non-deleted candidate: @@ -63,99 +61,47 @@ For each non-deleted candidate: ## Step 4: Check for NOTICES Entry Changes -If `notices_file_changed` is `true`, compare old vs. new `THIRD_PARTY_NOTICES.md`: - -```bash -git show $(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main):THIRD_PARTY_NOTICES.md -``` - -Check for: +If `notices_file_changed` is `true`, retrieve the merge-base version of `THIRD_PARTY_NOTICES.md` and compare it against the current version. Skip entries whose source files were already analyzed as candidates in Step 3. -1. **Removed entries** — Headings present in the old version but absent in the new. For each, check whether the source files referenced in the entry's Scope section still exist in the repo (use `Glob` or `Read`). If they still exist and contain third-party attribution headers, flag: the NOTICES entry was removed but vendored source files still reference this library. If the referenced files were also deleted in this branch, note it as informational only. +1. **Removed entries** — Headings present in the old version but absent in the new. Check whether the source files referenced in the entry's Scope section still exist and contain third-party attribution headers. If so, flag it. If the referenced files were also deleted in this branch, note it as informational only. -2. **Modified entries** — Headings present in both but with changed content. Read the source files referenced in the entry's Scope section and compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag any inconsistencies. If the entry is consistent with the source files, note it as informational only. +2. **Modified entries** — Headings present in both but with changed content. Compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag inconsistencies; note consistent changes as informational only. ## Step 5: Output Results -### No issues found - -Print "No attribution issues found." - -### Issues found - -Print findings to the terminal, grouped by file. Prefix lines that require immediate action with ⚠️. Informational reminders (verify, check) get a 👀 prefix. - -Use the following format (only include lines that are relevant; number each entry; omit entries if the user doesn't have to do anything; omit Action Items or False Positives sections if none found; put each bullet point on its own line; wrap lines when they reach the edge of the "Outcome of check-code-attribution" box): - -``` -********************************************************************************** -* Outcome of check-code-attribution * -********************************************************************************** - ------------------------------------ Action items --------------------------------- - -1. ⚠️ File: - Vendored code detected () — missing required fields: - - - - -``` - -For files where attribution markers were removed: -``` -1. ⚠️ File: - Required attribution field(s) removed: - - - - -``` - -For files with a matching THIRD_PARTY_NOTICES.md entry: -``` -1. 👀 File: - Vendored code detected () – verify that `THIRD_PARTY_NOTICES.md` reflects your updates. -``` - -For renamed files: -``` -1. 👀 File: - Vendored file renamed – Verify `THIRD_PARTY_NOTICES.md` reflects your updates. -``` +If there are no issues, print: -For deleted files: ``` -1. 👀 File: - Deleted vendored file – Verify `THIRD_PARTY_NOTICES.md` reflects your updates. +✅ No attribution issues found. ``` -For removed NOTICES entries where source files still exist: -``` -1. ⚠️ NOTICES entry removed: - Source file(s) still reference this library: - - `` still contains attribution header for . Either restore the `THIRD_PARTY_NOTICES.md` entry or remove the vendored code. -``` +Otherwise, print findings as a numbered list. Use fully qualified class names (e.g., `io.sentry.cache.tape.FileObjectQueue`). Guidelines: -For modified NOTICES entries with inconsistencies: -``` -1. ⚠️ NOTICES entry modified: - Entry metadata inconsistent with source file headers: - - -``` +- **⚠️** = must fix before merging (missing fields, stripped attribution, inconsistent or orphaned NOTICES entries). +- **👀** = author should verify (deleted/renamed files, matched NOTICES entries, consistent NOTICES modifications). +- Keep license-header issues and `THIRD_PARTY_NOTICES.md` issues in separate bullets. +- For license types not yet in NOTICES, link https://open.sentry.io/licensing/ for compatibility review. +- Be concise — say what's wrong and what to do. +- If any candidates are false positives, list them at the end with a one-line reason each. -For modified NOTICES entries that are consistent: -``` -1. 👀 NOTICES entry modified: - Verify updated entry is consistent with source file headers. -``` +Example output: -For new license types: -``` - - ❗This license type is not yet represented in `THIRD_PARTY_NOTICES.md`. Please verify it is compatible with Sentry's licensing policies: https://open.sentry.io/licensing/. ``` +Code Attribution Check +══════════════════════ -``` +Urgent +──────────── +1. ⚠️ io.sentry.util.TokenBucket + Vendored code (Guava) — header is missing the source URL and copyright year. + - No corresponding `THIRD_PARTY_NOTICES.md` entry; add one. ----------------------------------- False positives ------------------------------- +Verify +────── +2. 👀 io.sentry.cache.tape.FileObjectQueue + Vendored code (Square Tape) — verify `THIRD_PARTY_NOTICES.md` reflects your updates. - +False positives +─────────────── +1. AGENTS.md — project documentation, not vendored code. ``` - -If there are no Action Items, print the following after *all* sections: "✅ Everything looks good. No attribution issues found." diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 7e96a4ae186..d22e41901fb 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -121,7 +121,6 @@ has_removed_attribution_lines() { grep -E '^-' <<< "$diff_output" | grep -v '^--- ' | grep -qiE "$ATTRIBUTION_PATTERN" } - # Collect changed files from all sources (committed, staged, unstaged, untracked), # deduplicated by current filepath (first occurrence wins). The committed diff is listed # first so its status character takes precedence — a file committed as "A" (added) won't diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh index d059b1ea750..55f45e6efc0 100644 --- a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -649,10 +649,6 @@ JAVA } - - - - # --- Run all tests --- run_test "Clean branch — no candidates" test_clean_branch_no_candidates From de1b3f94045e957b43426fb67bc0c621293156bd Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Thu, 14 May 2026 15:05:49 +0200 Subject: [PATCH 08/10] Have LLM weigh in on license compatibility --- .../skills/check-code-attribution/SKILL.md | 29 ++++++++++++++----- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index 9e7708d455f..f3b27bd82d0 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -57,7 +57,14 @@ For each non-deleted candidate: 2. **Match to a `THIRD_PARTY_NOTICES.md` entry** — Try to find a corresponding entry by URL, library name, copyright holder, or other context. Record whether you found a match, and if so, which entry. -3. **Check the license type** — Identify the license declared in the file's header. Check whether this license type is already represented in `THIRD_PARTY_NOTICES.md` headings. If it's a new license type not yet in NOTICES, flag it for compatibility review. +3. **Check license compatibility** — Identify the license in the file's header and classify it per Sentry's Open Source Legal Policy (https://www.notion.so/sentry/ac4885d265cb4d7898a01c060b061e42; public summary at https://open.sentry.io/licensing/). sentry-java is MIT-licensed. The policy defines four tiers: + - **Permissive** (MIT, BSD, Apache 2.0, ISC, CC-BY, etc.) — allowed. No action needed. + - **Weak copyleft** (LGPL, MPL, EPL, CDDL, etc.) — may be allowed for vendoring but requires review. Flag as **Critical** with a note to check the policy's permissions matrix. + - **Strong copyleft** (GPL, QPL, Sleepycat) — flag as **Critical**, requires legal review before vendoring. + - **AGPL** — **absolute ban**, must not be used at Sentry for any use case. Flag as **Critical** and block. + - **No license** — assume no permission to use. Flag as **Critical**. + + Also check whether this license type is already represented in `THIRD_PARTY_NOTICES.md` headings; if it's new, note it. ## Step 4: Check for NOTICES Entry Changes @@ -77,12 +84,14 @@ If there are no issues, print: Otherwise, print findings as a numbered list. Use fully qualified class names (e.g., `io.sentry.cache.tape.FileObjectQueue`). Guidelines: -- **⚠️** = must fix before merging (missing fields, stripped attribution, inconsistent or orphaned NOTICES entries). -- **👀** = author should verify (deleted/renamed files, matched NOTICES entries, consistent NOTICES modifications). +- **❗❗⚠️ ❗❗** = license issue (AGPL, strong copyleft, unlicensed code). Goes in the **Critical** section. +- **⚠️** = must fix before merging (missing fields, stripped attribution, inconsistent or orphaned NOTICES entries). Goes in the **Urgent** section. +- **👀** = author should verify (deleted/renamed files, matched NOTICES entries, consistent NOTICES modifications, weak copyleft or new license type). Goes in the **Verify** section. - Keep license-header issues and `THIRD_PARTY_NOTICES.md` issues in separate bullets. -- For license types not yet in NOTICES, link https://open.sentry.io/licensing/ for compatibility review. +- For license concerns, link the policy: https://open.sentry.io/licensing/ - Be concise — say what's wrong and what to do. - If any candidates are false positives, list them at the end with a one-line reason each. +- Omit any section that has no entries. Example output: @@ -90,15 +99,21 @@ Example output: Code Attribution Check ══════════════════════ +Critical +──────── +1. ❗❗⚠️❗❗ io.sentry.util.AgplHelper + AGPL-licensed code — absolute ban per Sentry policy. Must be removed. + - Policy: https://www.notion.so/sentry/ac4885d265cb4d7898a01c060b061e42 + Urgent -──────────── -1. ⚠️ io.sentry.util.TokenBucket +────── +2. ⚠️ io.sentry.util.TokenBucket Vendored code (Guava) — header is missing the source URL and copyright year. - No corresponding `THIRD_PARTY_NOTICES.md` entry; add one. Verify ────── -2. 👀 io.sentry.cache.tape.FileObjectQueue +3. 👀 io.sentry.cache.tape.FileObjectQueue Vendored code (Square Tape) — verify `THIRD_PARTY_NOTICES.md` reflects your updates. False positives From 4dbb4fb78f06be174648f8020f7e3c5e87649a37 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Thu, 14 May 2026 15:31:23 +0200 Subject: [PATCH 09/10] Prompt and output refinements based on manual testing feedback --- .../skills/check-code-attribution/SKILL.md | 81 ++-- .../find-attribution-candidates.sh | 133 +++--- .../test/test-find-attribution-candidates.sh | 391 +++++++++++++++--- 3 files changed, 475 insertions(+), 130 deletions(-) diff --git a/.claude/skills/check-code-attribution/SKILL.md b/.claude/skills/check-code-attribution/SKILL.md index f3b27bd82d0..1b511ef0afe 100644 --- a/.claude/skills/check-code-attribution/SKILL.md +++ b/.claude/skills/check-code-attribution/SKILL.md @@ -1,7 +1,7 @@ --- name: check-code-attribution description: Check vendored code attributions in branch diff and flag any that are deficient. Use when asked to "check attribution", "check licenses", "verify vendored code attribution", or "check code attribution". -allowed-tools: Bash, Read, Grep, Glob +allowed-tools: Bash, Read --- # Check Code Attribution @@ -16,30 +16,30 @@ Run the pre-filter script: bash .claude/skills/check-code-attribution/find-attribution-candidates.sh ``` -If the script exits with code 0, there are no candidates. Print "✅ No attribution issues found." +Stdout **always** starts with **global metadata** (first two lines): + +``` +notices_file_exists: true|false +notices_file_changed: true|false +``` If the script exits with any code besides 0 or 10, it failed (e.g., could not determine merge-base). Print the stderr output and **stop**. -If the script exits with code 10, it outputs: +If the script exits with code **0**, there are no file candidates **and** `THIRD_PARTY_NOTICES.md` is unchanged vs the merge-base. Print "✅ No attribution issues found." and **stop**. -1. **Global metadata** (first two lines): - ``` - notices_file_exists: true|false - notices_file_changed: true|false - ``` +If the script exits with code **10**, there is at least one file candidate **and/or** `THIRD_PARTY_NOTICES.md` changed (including NOTICES-only edits that produce **zero** candidate blocks). After the two metadata lines, stdout may contain zero or more **candidate blocks**: -2. **Candidate blocks** (one per flagged file): - ``` - --- - file: - status: A|M|D|R (i.e., "added", "modified", "deleted", "renamed") - reasons: - --- - ``` +``` +--- +file: +status: A|M|D|R (i.e., "added", "modified", "deleted", "renamed") +reasons: +--- +``` The script handles candidate identification deterministically — including committed, staged, and unstaged changes — so trust its output. Do not dismiss a candidate as a false positive based on the committed diff alone. -Parse the output and proceed to Step 2. +Parse the metadata and any candidate blocks, then proceed to Step 2. If there are **zero** candidate blocks but `notices_file_changed` is `true`, skip Step 3 and still run Step 4. ## Step 2: Gather Context @@ -53,14 +53,14 @@ If `notices_file_exists` is `true`, read `THIRD_PARTY_NOTICES.md` to understand For each non-deleted candidate: -1. **Read the file** and check for the required attribution fields from `CODE_ATTRIBUTION_CRITERIA.md`. Extra information beyond the required fields is fine, as are differences in formatting. Only flag **missing** fields. +1. **Read the file** and check for the required attribution fields from `CODE_ATTRIBUTION_CRITERIA.md`: The criteria shows a canonical template, but the exact wording, comment style, and formatting don't need to match exactly. Only flag **missing** fields. For candidates whose reasons include "removed", also read the merge-base version (`git show "$MB:"`) to see what attribution was there before — you need both versions to determine whether attribution was stripped vs. never present. 2. **Match to a `THIRD_PARTY_NOTICES.md` entry** — Try to find a corresponding entry by URL, library name, copyright holder, or other context. Record whether you found a match, and if so, which entry. -3. **Check license compatibility** — Identify the license in the file's header and classify it per Sentry's Open Source Legal Policy (https://www.notion.so/sentry/ac4885d265cb4d7898a01c060b061e42; public summary at https://open.sentry.io/licensing/). sentry-java is MIT-licensed. The policy defines four tiers: - - **Permissive** (MIT, BSD, Apache 2.0, ISC, CC-BY, etc.) — allowed. No action needed. - - **Weak copyleft** (LGPL, MPL, EPL, CDDL, etc.) — may be allowed for vendoring but requires review. Flag as **Critical** with a note to check the policy's permissions matrix. - - **Strong copyleft** (GPL, QPL, Sleepycat) — flag as **Critical**, requires legal review before vendoring. +3. **Check license compatibility** — Identify the license in the file's header and classify it. sentry-java is MIT-licensed. Sentry's Open Source Legal Policy (https://open.sentry.io/licensing/) defines four tiers: + - **Permissive** (MIT, BSD, Apache 2.0, ISC, CC-BY, CC0, Unlicense, WTFPL, Zlib, etc.) — allowed. No action needed. + - **Weak copyleft** (LGPL, MPL, EPL, CDDL, CPL, etc.) — may be allowed for vendoring but requires review. Flag as **Critical** with a note to verify against the policy. + - **Strong copyleft** (GPL, QPL, Sleepycat, OSL, etc.) — flag as **Critical**, requires legal review before vendoring. - **AGPL** — **absolute ban**, must not be used at Sentry for any use case. Flag as **Critical** and block. - **No license** — assume no permission to use. Flag as **Critical**. @@ -68,11 +68,18 @@ For each non-deleted candidate: ## Step 4: Check for NOTICES Entry Changes -If `notices_file_changed` is `true`, retrieve the merge-base version of `THIRD_PARTY_NOTICES.md` and compare it against the current version. Skip entries whose source files were already analyzed as candidates in Step 3. +If `notices_file_changed` is `true`, compare the merge-base revision of `THIRD_PARTY_NOTICES.md` to the current file. Resolve merge-base the same way as `find-attribution-candidates.sh` (`origin/main`, else `main`): + +```bash +MB=$(git merge-base HEAD origin/main 2>/dev/null || git merge-base HEAD main) +``` + +- **Old (merge-base) content:** `git show "$MB:THIRD_PARTY_NOTICES.md"` — if this fails (file absent at merge-base), treat the old side as empty. +- **New (current) content:** read `THIRD_PARTY_NOTICES.md` from the repo root when `notices_file_exists` is `true`. When `notices_file_exists` is `false`, the file is gone from the worktree (for example deleted on the branch); treat the new side as empty so every merge-base entry shows up as removed for analysis. -1. **Removed entries** — Headings present in the old version but absent in the new. Check whether the source files referenced in the entry's Scope section still exist and contain third-party attribution headers. If so, flag it. If the referenced files were also deleted in this branch, note it as informational only. +Compare old vs. new and verify that every `THIRD_PARTY_NOTICES.md` entry is consistent with the source file headers from the diff: metadata matches, no orphaned or missing entries, no stale Scope paths. Skip entries whose source files were already analyzed in Step 3 — they're covered there. Any entries with new license types (e.g., AGPL where no other entry has an AGPL license) must be flagged as **Critical**. -2. **Modified entries** — Headings present in both but with changed content. Compare the entry's metadata (Source, License, Copyright) against the source file headers. Flag inconsistencies; note consistent changes as informational only. +**Also check even when `notices_file_changed` is `false`:** if the branch deletes or renames a source file (status D or R), verify that the corresponding NOTICES entry was updated or removed. This catches the case where NOTICES *should* have changed but didn't. ## Step 5: Output Results @@ -84,13 +91,14 @@ If there are no issues, print: Otherwise, print findings as a numbered list. Use fully qualified class names (e.g., `io.sentry.cache.tape.FileObjectQueue`). Guidelines: -- **❗❗⚠️ ❗❗** = license issue (AGPL, strong copyleft, unlicensed code). Goes in the **Critical** section. +- **🚨** = license issue (AGPL, strong copyleft, weak copyleft, new license type, unlicensed code). Goes in the **Critical** section. - **⚠️** = must fix before merging (missing fields, stripped attribution, inconsistent or orphaned NOTICES entries). Goes in the **Urgent** section. -- **👀** = author should verify (deleted/renamed files, matched NOTICES entries, consistent NOTICES modifications, weak copyleft or new license type). Goes in the **Verify** section. +- **👀** = author should verify (deleted/renamed files, matched NOTICES entries, consistent NOTICES modifications). Goes in the **Verify** section. - Keep license-header issues and `THIRD_PARTY_NOTICES.md` issues in separate bullets. - For license concerns, link the policy: https://open.sentry.io/licensing/ -- Be concise — say what's wrong and what to do. +- Be **very, very** concise — say what's wrong and what to do in as few words as possible! - If any candidates are false positives, list them at the end with a one-line reason each. +- Separate each numbered entry with an empty line for readability (see example "Urgent" output below). - Omit any section that has no entries. Example output: @@ -101,22 +109,27 @@ Code Attribution Check Critical ──────── -1. ❗❗⚠️❗❗ io.sentry.util.AgplHelper +1. 🚨 io.sentry.util.AgplHelper AGPL-licensed code — absolute ban per Sentry policy. Must be removed. - - Policy: https://www.notion.so/sentry/ac4885d265cb4d7898a01c060b061e42 + - Policy: https://open.sentry.io/licensing/ Urgent ────── 2. ⚠️ io.sentry.util.TokenBucket - Vendored code (Guava) — header is missing the source URL and copyright year. + Vendored code (Guava) — header is missing the source URL and copyright + year. - No corresponding `THIRD_PARTY_NOTICES.md` entry; add one. +3. ⚠️ io.sentry.android.core.ANRWatchDog + MIT license header was stripped. Restore the attribution header. + Verify ────── -3. 👀 io.sentry.cache.tape.FileObjectQueue - Vendored code (Square Tape) — verify `THIRD_PARTY_NOTICES.md` reflects your updates. +4. 👀 io.sentry.cache.tape.FileObjectQueue + Vendored code (Square Tape) — verify `THIRD_PARTY_NOTICES.md` reflects + your updates. False positives ─────────────── -1. AGENTS.md — project documentation, not vendored code. +a. AGENTS.md — project documentation, not vendored code. ``` diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index d22e41901fb..00eddbf1d23 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -6,8 +6,7 @@ # script handles cheap, deterministic identification and filtering; all interpretation (cross- # referencing with THIRD_PARTY_NOTICES.md, license classification, etc.) is left to the LLM. # -# When candidates are found, the output starts with global metadata followed by one block per -# candidate: +# Every run prints global metadata first (two lines), then zero or more candidate blocks: # # notices_file_exists: true|false # notices_file_changed: true|false @@ -17,7 +16,9 @@ # reasons: # --- # -# Exit code 0 = no candidates found, exit code 10 = candidates found. +# Exit code 0 = no file candidates and THIRD_PARTY_NOTICES.md unchanged vs merge-base. +# Exit code 10 = one or more file candidates and/or THIRD_PARTY_NOTICES.md changed (including +# NOTICES-only edits with zero candidate blocks when diff hunks do not match attribution patterns). set -euo pipefail @@ -42,26 +43,28 @@ VENDORING_MARKERS='adapted from|backported from|copied from|derived from|ported # Recognized open-source license names (regex alternation) LICENSE_NAMES='Apache 2\.0|Apache License|BSD [0-9]|BSD License|CC-BY|Creative Commons|Eclipse Public License|EPL|GNU General Public|GPL|ISC License|LGPL|MIT License|Mozilla Public|Public Domain|SPDX-License-Identifier|Unlicense' -# Combined pattern for diff-hunk scanning of modified files (includes generic terms -# "copyright" and "licensed" which are appropriate for individual changed lines) +# Combined pattern for diff-hunk scanning of modified files. Intentionally broader +# than VENDORING_MARKERS: it includes generic terms ("copyright", "licensed") so the +# diff-hunk scan catches any attribution-related change. False positives from first- +# party headers are filtered out downstream by has_third_party_attribution() checks +# against the full file content (for additions) or merge-base content (for removals). ATTRIBUTION_PATTERN="$VENDORING_MARKERS|copyright|licensed|$LICENSE_NAMES" # Sentry entity names — copyright lines mentioning these are treated as first-party SENTRY_ENTITIES='functional software|getsentry|sentry software' +# Build sed expression from SENTRY_ENTITIES (keeps entity list and strip patterns in sync) +_sentry_sed="" +IFS='|' read -ra _sentry_parts <<< "$SENTRY_ENTITIES" +for _part in "${_sentry_parts[@]}"; do + _sentry_sed+="s/${_part}//g; " +done +SENTRY_STRIP_SED="${_sentry_sed}s/sentry//g; s/copyright//g; s/(c)//g" +unset _sentry_sed _sentry_parts _part + # Path segments that suggest vendored/external code VENDOR_PATH_MARKERS='external|shaded|third-party|third_party|thirdparty|vendor|vendored' -# --- Global metadata (emitted once before any candidates) --- -global_metadata_emitted=false -emit_global_metadata() { - if [[ "$global_metadata_emitted" == "false" ]]; then - echo "notices_file_exists: $notices_file_exists" - echo "notices_file_changed: $notices_file_changed" - global_metadata_emitted=true - fi -} - is_binary_file() { # -I treats binary files as non-matching → exit 1; empty pattern matches any text file → exit 0 ! grep -Iq '' "$1" 2>/dev/null @@ -73,13 +76,26 @@ is_excluded_path() { [[ "$1" =~ ^\.claude/ || "$1" =~ ^\.github/ || "$1" =~ ^\.gradle/ || "$1" =~ ^\.idea/ || "$1" =~ ^\.mvn/ || "$1" =~ ^buildSrc/ || "$1" =~ ^build-logic/ || "$1" =~ ^gradle/ ]] } -# Load exclusion patterns once into a temp file (regexes from a repo-controlled file — review -# changes carefully). Using a file avoids re-creating a process substitution per candidate. -EXCLUSION_PATTERNS_FILE=$(mktemp) -CONTENT_FILE=$(mktemp) -trap 'rm -f "$EXCLUSION_PATTERNS_FILE" "$CONTENT_FILE"' EXIT +# Load exclusion patterns once into a temp file. Each pattern is validated before inclusion: +# invalid regexes are skipped with a warning, and patterns over 200 chars are rejected to +# limit ReDoS surface (the file is repo-controlled, but validation catches accidental breakage). +WORK_DIR=$(mktemp -d) +trap 'rm -rf "$WORK_DIR"' EXIT +EXCLUSION_PATTERNS_FILE="$WORK_DIR/exclusions" if [[ -f "$EXCLUSIONS_FILE" ]]; then - grep -v '^#' "$EXCLUSIONS_FILE" | grep -v '^$' > "$EXCLUSION_PATTERNS_FILE" || true + while IFS= read -r pattern; do + [[ -z "$pattern" || "$pattern" == \#* ]] && continue + if [[ ${#pattern} -gt 200 ]]; then + echo "Warning: skipping exclusion pattern exceeding 200 chars: ${pattern:0:40}..." >&2 + continue + fi + rc=0; printf '' | grep -qE -- "$pattern" 2>/dev/null || rc=$? + if [[ $rc -eq 2 ]]; then + echo "Warning: skipping invalid regex in exclusions: $pattern" >&2 + continue + fi + printf '%s\n' "$pattern" + done < "$EXCLUSIONS_FILE" > "$EXCLUSION_PATTERNS_FILE" fi is_generated_file() { @@ -98,10 +114,23 @@ has_third_party_attribution() { # Vendoring markers are strong standalone indicators grep -qiE "$VENDORING_MARKERS" "$filepath" && return 0 - # A non-Sentry copyright line is a strong standalone indicator. - # Note: a line like "Copyright Sentry and Example Corp" is excluded because the Sentry entity - # matches — split dual-copyright lines onto separate lines in the source to avoid this. - grep -iE 'copyright' "$filepath" | grep -qivE "$SENTRY_ENTITIES" && return 0 + local copyright_lines + copyright_lines=$(grep -iE 'copyright' "$filepath" 2>/dev/null) || true + [[ -z "$copyright_lines" ]] && return 1 + + # Fast path: any copyright line without a Sentry entity is definitively third-party + echo "$copyright_lines" | grep -qivE "$SENTRY_ENTITIES" && return 0 + + # Slow path: all copyright lines mention a Sentry entity. Check for dual-copyright + # lines (e.g., "Copyright Functional Software and Example Corp") by stripping Sentry + # names and common boilerplate, then looking for remaining substantive words. + echo "$copyright_lines" | \ + tr '[:upper:]' '[:lower:]' | \ + sed "$SENTRY_STRIP_SED" | \ + sed 's/[0-9]//g; s/[^a-z]/ /g' | \ + tr -s ' ' '\n' | \ + grep -vxE '(and|the|inc|llc|ltd|or|of|by|all|rights|reserved)' | \ + grep -qE '[a-z]{3,}' && return 0 # License keywords alone (without a non-Sentry copyright or vendoring marker) do NOT # indicate vendored code — many first-party files carry the project's own license header. @@ -112,19 +141,19 @@ has_third_party_attribution() { # Check if diff hunks contain added attribution-related lines has_added_attribution_lines() { local diff_output="$1" - grep -E '^\+' <<< "$diff_output" | grep -vE '^\+\+\+' | grep -qiE "$ATTRIBUTION_PATTERN" + grep -E '^\+' <<< "$diff_output" | grep -vE '^\+\+\+ (b/|/dev/null)' | grep -qiE "$ATTRIBUTION_PATTERN" } # Check if diff hunks contain removed attribution-related lines has_removed_attribution_lines() { local diff_output="$1" - grep -E '^-' <<< "$diff_output" | grep -v '^--- ' | grep -qiE "$ATTRIBUTION_PATTERN" + grep -E '^-' <<< "$diff_output" | grep -vE '^--- (a/|/dev/null)' | grep -qiE "$ATTRIBUTION_PATTERN" } # Collect changed files from all sources (committed, staged, unstaged, untracked), -# deduplicated by current filepath (first occurrence wins). The committed diff is listed -# first so its status character takes precedence — a file committed as "A" (added) won't -# be overridden by a staged or unstaged "M" (modified) for the same path. +# deduplicated by current filepath (last occurrence wins). Sources are listed oldest +# to newest, so the most recent state takes precedence — e.g. a file committed as +# "M" (modified) then staged for deletion resolves to "D". collect_changed_files() { { git diff "$MERGE_BASE"..HEAD --name-status --find-renames 2>/dev/null || true @@ -134,11 +163,13 @@ collect_changed_files() { [[ -n "$path" ]] && printf 'A\t%s\n' "$path" done } | awk -F'\t' '{ - # For renames (R###), the current path is the third field (new name). - # For everything else, the current path is the last field. if ($1 ~ /^R/ && NF >= 3) key = $3 else key = $NF - if (!seen[key]++) print + data[key] = $0 + if (!seen[key]++) order[++n] = key + } + END { + for (i = 1; i <= n; i++) print data[order[i]] }' } @@ -147,7 +178,7 @@ collect_changed_files() { # directly against the current worktree file. combined_diff() { local filepath="$1" - git diff "$MERGE_BASE" -- "$filepath" 2>/dev/null || true + git diff "$MERGE_BASE" -- "$filepath" 2>/dev/null || echo "Warning: git diff failed for $filepath" >&2 } # Check if THIRD_PARTY_NOTICES.md was modified in this branch (committed, staged, or unstaged) @@ -158,6 +189,11 @@ if git diff "$MERGE_BASE"..HEAD --name-only -- "$NOTICES_FILE" 2>/dev/null | gre notices_file_changed="true" fi +# Global metadata is always printed first so consumers can run NOTICES review even when there are +# zero file candidates (e.g. Scope-only edits to THIRD_PARTY_NOTICES.md). +echo "notices_file_exists: $notices_file_exists" +echo "notices_file_changed: $notices_file_changed" + found_any=false # Process each changed file @@ -174,6 +210,8 @@ while IFS=$'\t' read -r status filepath old_filepath; do filepath="$current_path" fi + # NOTICES changes are tracked via the notices_file_changed metadata field, not as a candidate. + [[ "$filepath" == "$NOTICES_FILE" ]] && continue is_excluded_path "$filepath" && continue is_generated_file "$filepath" && continue @@ -191,27 +229,27 @@ while IFS=$'\t' read -r status filepath old_filepath; do if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 fi - head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + head -c 102400 "$filepath" > "$WORK_DIR/content" 2>/dev/null || continue if has_vendor_path "$filepath"; then is_candidate=true reasons+=("path suggests vendored code") fi - if has_third_party_attribution "$CONTENT_FILE"; then + if has_third_party_attribution "$WORK_DIR/content"; then is_candidate=true reasons+=("attribution markers in file") fi elif [[ "$status_char" == "D" ]]; then - git show "$MERGE_BASE":"$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + git show "$MERGE_BASE":"$filepath" > "$WORK_DIR/content" 2>/dev/null || continue if has_vendor_path "$filepath"; then is_candidate=true reasons+=("deleted file in vendor path") fi - if has_third_party_attribution "$CONTENT_FILE"; then + if has_third_party_attribution "$WORK_DIR/content"; then is_candidate=true reasons+=("deleted file with attribution markers") fi @@ -221,14 +259,14 @@ while IFS=$'\t' read -r status filepath old_filepath; do if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 fi - head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + head -c 102400 "$filepath" > "$WORK_DIR/content" 2>/dev/null || continue if has_vendor_path "$filepath" || has_vendor_path "${old_filepath:-}"; then is_candidate=true reasons+=("renamed file in vendor path") fi - if has_third_party_attribution "$CONTENT_FILE"; then + if has_third_party_attribution "$WORK_DIR/content"; then is_candidate=true reasons+=("renamed file with attribution markers") fi @@ -237,7 +275,7 @@ while IFS=$'\t' read -r status filepath old_filepath; do if [[ $(wc -c < "$filepath" 2>/dev/null) -gt 102400 ]]; then echo "Warning: $filepath exceeds 100KB — only the first 100KB will be scanned for attribution markers." >&2 fi - head -c 102400 "$filepath" > "$CONTENT_FILE" 2>/dev/null || continue + head -c 102400 "$filepath" > "$WORK_DIR/content" 2>/dev/null || continue diff_output=$(combined_diff "$filepath") has_added=false @@ -256,11 +294,17 @@ while IFS=$'\t' read -r status filepath old_filepath; do # match first-party headers too. When markers were only added (not removed), # filter out files whose full content has no third-party attribution — those # are Sentry's own license headers being added. - if ! has_third_party_attribution "$CONTENT_FILE" && ! has_vendor_path "$filepath"; then + if ! has_third_party_attribution "$WORK_DIR/content" && ! has_vendor_path "$filepath"; then continue fi reasons+=("attribution markers added") else + # Mirror the added-only guard: check the merge-base content for third-party + # attribution so we don't flag removal of Sentry's own copyright headers. + git show "$MERGE_BASE":"$filepath" > "$WORK_DIR/old_content" 2>/dev/null || continue + if ! has_third_party_attribution "$WORK_DIR/old_content" && ! has_vendor_path "$filepath"; then + continue + fi reasons+=("attribution markers removed") fi is_candidate=true @@ -275,9 +319,6 @@ while IFS=$'\t' read -r status filepath old_filepath; do # Format reasons as comma-separated string reasons_str=$(printf '%s, ' "${reasons[@]+${reasons[@]}}" | sed 's/, $//') - # Emit global metadata once before the first candidate - emit_global_metadata - # Output structured block echo "---" echo "file: $filepath" @@ -289,6 +330,6 @@ while IFS=$'\t' read -r status filepath old_filepath; do done < <(collect_changed_files) -if [[ "$found_any" == "true" ]]; then +if [[ "$found_any" == "true" ]] || [[ "$notices_file_changed" == "true" ]]; then exit 10 fi diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh index 55f45e6efc0..db3b3685c54 100644 --- a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -6,7 +6,7 @@ # Each test creates a temporary git repo, sets up a specific scenario, # runs the script, and asserts on its output and exit code. # -# Usage: bash test-find-attribution-candidates.sh +# Usage: bash test/test-find-attribution-candidates.sh set -euo pipefail @@ -85,6 +85,20 @@ assert_contains() { return 1 } +# Script contract: stdout always begins with these two lines (in order). +assert_global_metadata_prefix() { + local output="$1" exists="$2" changed="$3" msg="$4" + assert_eq "$(echo "$output" | head -n 1)" "notices_file_exists: $exists" "${msg} — notices_file_exists line" || return 1 + assert_eq "$(echo "$output" | sed -n '2p')" "notices_file_changed: $changed" "${msg} — notices_file_changed line" || return 1 +} + +assert_line_count() { + local output="$1" expected="$2" msg="$3" + local count + count=$(printf '%s\n' "$output" | wc -l | tr -d '[:space:]') + assert_eq "$count" "$expected" "$msg" || return 1 +} + run_test() { local test_name="$1" test_fn="$2" TESTS_RUN=$((TESTS_RUN + 1)) @@ -122,9 +136,11 @@ test_clean_branch_no_candidates() { git add src/Clean.java git commit -m "Add clean file" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "should exit 0 when no candidates" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "should exit 0 when no candidates and NOTICES unchanged" || return 1 + assert_global_metadata_prefix "$output" true false "clean branch" || return 1 + assert_line_count "$output" 2 "exit 0 with no work should print exactly metadata lines" || return 1 } test_new_file_with_attribution() { @@ -144,6 +160,8 @@ JAVA local output exit_code=0 ok=true output=$(run_script) || exit_code=$? assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_global_metadata_prefix "$output" true false "candidate with NOTICES unchanged" || ok=false + assert_eq "$(echo "$output" | sed -n '3p')" "---" "metadata lines precede first candidate block" || ok=false assert_eq "$(get_field "$output" "file")" "src/Adapted.java" "file path" || ok=false assert_eq "$(get_field "$output" "status")" "A" "status" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers in file" "reasons" || ok=false @@ -157,13 +175,29 @@ test_new_file_in_vendor_path() { git add vendor/lib/Foo.java git commit -m "Add vendored file" --quiet - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when vendor path candidate found" || ok=false assert_eq "$(get_field "$output" "file")" "vendor/lib/Foo.java" "file path" || ok=false assert_contains "$(get_field "$output" "reasons")" "path suggests vendored code" "reasons" || ok=false [[ "$ok" == "true" ]] } +test_new_file_under_io_sentry_vendor_path() { + setup_branch + mkdir -p io/sentry/vendor + echo "public class VendorStub {}" > io/sentry/vendor/VendorStub.java + git add io/sentry/vendor/VendorStub.java + git commit -m "Add file under io/sentry/vendor" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when io/sentry/vendor path candidate found" || ok=false + assert_eq "$(get_field "$output" "file")" "io/sentry/vendor/VendorStub.java" "file path" || ok=false + assert_contains "$(get_field "$output" "reasons")" "path suggests vendored code" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + test_deleted_file_with_attribution() { mkdir -p src cat > src/Licensed.java << 'JAVA' @@ -234,8 +268,9 @@ JAVA git add src/Strip.java git commit -m "Remove attribution" --quiet - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false assert_eq "$(get_field "$output" "status")" "M" "status" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers removed" "reasons" || ok=false [[ "$ok" == "true" ]] @@ -258,8 +293,9 @@ JAVA git mv src/old/Lib.java src/new/Lib.java git commit -m "Rename file" --quiet - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false assert_eq "$(get_field "$output" "file")" "src/new/Lib.java" "file should be new path" || ok=false assert_eq "$(get_field "$output" "status")" "R" "status" || ok=false assert_contains "$(get_field "$output" "reasons")" "renamed file with attribution markers" "reasons" || ok=false @@ -283,8 +319,9 @@ JAVA git add src/Staged.java # NOT committed - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when staged candidate found" || ok=false assert_contains "$output" "src/Staged.java" "should detect staged file" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers in file" "reasons" || ok=false [[ "$ok" == "true" ]] @@ -311,8 +348,9 @@ JAVA git add src/StagedMod.java # NOT committed — staged only - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when staged modification detected" || ok=false assert_contains "$output" "src/StagedMod.java" "should detect staged modification" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers added" "reasons" || ok=false [[ "$ok" == "true" ]] @@ -334,8 +372,9 @@ public class Untracked {} JAVA # NOT staged, NOT committed - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when untracked candidate found" || ok=false assert_contains "$output" "src/Untracked.java" "should detect untracked file" || ok=false [[ "$ok" == "true" ]] } @@ -355,11 +394,67 @@ JAVA git add src/Noticed.java THIRD_PARTY_NOTICES.md git commit -m "Add attributed file and update notices" --quiet - local output - output=$(run_script) || true - assert_contains "$output" "notices_file_changed: true" "global notices_file_changed" + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates or NOTICES changed" || ok=false + assert_global_metadata_prefix "$output" true true "committed NOTICES + candidate" || ok=false + assert_eq "$(echo "$output" | sed -n '3p')" "---" "metadata precedes candidate block" || ok=false + [[ "$ok" == "true" ]] } +test_notices_only_change_triggers_without_file_candidates() { + setup_branch + printf '\n### Doc-only tweak\nPaths refreshed.\n' >> THIRD_PARTY_NOTICES.md + git add THIRD_PARTY_NOTICES.md + git commit -m "Doc-only NOTICES update" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "THIRD_PARTY_NOTICES.md change must exit 10" || ok=false + assert_global_metadata_prefix "$output" true true "NOTICES-only committed doc tweak" || ok=false + local count + count=$(echo "$output" | grep -c "^file: " || true) + assert_eq "$count" "0" "no file candidate blocks when diff avoids attribution keywords" || ok=false + assert_line_count "$output" 2 "NOTICES-only signal should be exactly two metadata lines" || ok=false + [[ "$ok" == "true" ]] +} + +test_notices_only_unstaged_triggers_without_file_candidates() { + setup_branch + echo "placeholder" > placeholder.txt + git add placeholder.txt + git commit -m "Placeholder" --quiet + + printf '\n### Unstaged doc tweak\nNo license keywords here.\n' >> THIRD_PARTY_NOTICES.md + # NOT staged — worktree-only change to NOTICES + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "unstaged NOTICES change must exit 10" || ok=false + assert_global_metadata_prefix "$output" true true "unstaged NOTICES only" || ok=false + local count + count=$(echo "$output" | grep -c "^file: " || true) + assert_eq "$count" "0" "no file blocks for NOTICES-only unstaged edit" || ok=false + assert_line_count "$output" 2 "stdout should be metadata only" || ok=false + [[ "$ok" == "true" ]] +} + +# NOTICES exists at merge-base (main) but is removed on the feature branch only. +test_notices_deleted_on_branch() { + setup_branch + git rm THIRD_PARTY_NOTICES.md --quiet + git commit -m "Remove THIRD_PARTY_NOTICES on branch" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "NOTICES deletion on branch must exit 10" || ok=false + assert_global_metadata_prefix "$output" false true "NOTICES deleted vs merge-base" || ok=false + local count + count=$(echo "$output" | grep -c "^file: " || true) + assert_eq "$count" "0" "no file candidates when only NOTICES is deleted" || ok=false + assert_line_count "$output" 2 "metadata only when NOTICES-only deletion" || ok=false + [[ "$ok" == "true" ]] +} test_excluded_paths_skipped() { setup_branch @@ -374,9 +469,11 @@ JAVA git add .github/workflows/Licensed.java git commit -m "Add file in excluded path" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "excluded paths should produce no candidates" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "excluded paths should produce no candidates" || return 1 + assert_global_metadata_prefix "$output" true false "excluded path" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } test_generated_files_skipped() { @@ -392,9 +489,11 @@ JAVA git add src/generated/com/example/R.java git commit -m "Add generated file" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "generated files should produce no candidates" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "generated files should produce no candidates" || return 1 + assert_global_metadata_prefix "$output" true false "generated file skipped" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } test_sentry_copyright_not_flagged() { @@ -408,9 +507,11 @@ JAVA git add src/SentryOwned.java git commit -m "Add Sentry-owned file" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "Sentry copyright should not trigger attribution" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "Sentry copyright should not trigger attribution" || return 1 + assert_global_metadata_prefix "$output" true false "Sentry-only copyright" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } @@ -432,9 +533,11 @@ JAVA git add THIRD_PARTY_NOTICES.md # NOT committed - local output - output=$(run_script) || true - assert_contains "$output" "notices_file_changed: true" "staged notices change should be detected" + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "staged NOTICES change must exit 10" || ok=false + assert_global_metadata_prefix "$output" true true "staged NOTICES with committed candidate" || ok=false + [[ "$ok" == "true" ]] } test_modified_vendor_path_file_attribution_changed() { @@ -466,8 +569,9 @@ JAVA git add vendor/lib/Vendored.java git commit -m "Update vendored file" --quiet - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when vendor-path attribution change detected" || ok=false assert_eq "$(get_field "$output" "status")" "M" "status" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers modified" "reasons should include attribution change" || ok=false assert_contains "$(get_field "$output" "reasons")" "file in vendor path" "reasons should include vendor path" || ok=false @@ -482,9 +586,11 @@ test_binary_file_skipped() { git add src/Binary.dat git commit -m "Add binary file" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "binary files should be skipped" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "binary files should be skipped" || return 1 + assert_global_metadata_prefix "$output" true false "binary skipped" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } # Many first-party Sentry files carry the project's own Apache 2.0 license header. @@ -504,9 +610,11 @@ JAVA git add src/LicenseOnly.java git commit -m "Add file with license-only header" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "license-only header without third-party copyright should not trigger attribution" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "license-only header without third-party copyright should not trigger attribution" || return 1 + assert_global_metadata_prefix "$output" true false "license-only header" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } test_modified_vendored_file_no_attribution_change() { @@ -538,9 +646,11 @@ JAVA git add src/Vendored.java git commit -m "Fix bug in vendored file" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "should not flag vendored file when only non-attribution lines changed" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "should not flag vendored file when only non-attribution lines changed" || return 1 + assert_global_metadata_prefix "$output" true false "vendored file non-attribution edit" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } # Adding Sentry's own license header to a first-party file should not trigger. @@ -562,9 +672,11 @@ JAVA git add src/FirstParty.java git commit -m "Add Sentry license header" --quiet - local exit_code=0 - run_script > /dev/null 2>&1 || exit_code=$? - assert_eq "$exit_code" "0" "adding Sentry's own license header should not trigger attribution" + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "adding Sentry's own license header should not trigger attribution" || return 1 + assert_global_metadata_prefix "$output" true false "first-party license header" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 } test_unstaged_modification_detected() { @@ -587,8 +699,9 @@ public class Unstaged {} JAVA # NOT staged, NOT committed — worktree only - local output ok=true - output=$(run_script) || true + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when unstaged modification detected" || ok=false assert_contains "$output" "src/Unstaged.java" "should detect unstaged modification" || ok=false assert_contains "$(get_field "$output" "reasons")" "attribution markers added" "reasons" || ok=false [[ "$ok" == "true" ]] @@ -644,16 +757,184 @@ JAVA local output exit_code=0 ok=true output=$(run_script) || exit_code=$? assert_eq "$exit_code" "10" "should still find candidates" || ok=false - assert_contains "$output" "notices_file_exists: false" "global notices_file_exists" || ok=false + assert_global_metadata_prefix "$output" false false "missing NOTICES with vendored candidate" || ok=false + assert_eq "$(echo "$output" | sed -n '3p')" "---" "metadata precedes candidate block" || ok=false + [[ "$ok" == "true" ]] +} + +test_merge_base_failure() { + git checkout --orphan no-main --quiet 2>/dev/null + echo "orphan" > orphan.txt + git add orphan.txt + git commit -m "Orphan commit" --quiet + git branch -D main --quiet 2>/dev/null || true + + local exit_code=0 output + output=$(run_script) || exit_code=$? + + local ok=true + assert_eq "$exit_code" "2" "should exit 2 when merge-base fails" || ok=false + assert_contains "$output" "could not determine merge-base" "error message" || ok=false + [[ "$ok" == "true" ]] +} + +test_renamed_into_vendor_path() { + mkdir -p src + echo "public class Moved {}" > src/Moved.java + git add src/Moved.java + git commit -m "Add file" --quiet + + setup_branch + mkdir -p vendor/lib + git mv src/Moved.java vendor/lib/Moved.java + git commit -m "Move to vendor path" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when renamed into vendor path" || ok=false + assert_eq "$(get_field "$output" "file")" "vendor/lib/Moved.java" "file should be new vendor path" || ok=false + assert_eq "$(get_field "$output" "status")" "R" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "renamed file in vendor path" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_renamed_out_of_vendor_path() { + mkdir -p vendor/lib + cat > vendor/lib/Leaving.java << 'JAVA' +// Adapted from LeavingLib. +// Copyright 2024 Leaving Author. +// Licensed under the MIT License. +package com.example; +public class Leaving {} +JAVA + git add vendor/lib/Leaving.java + git commit -m "Add vendored file" --quiet + + setup_branch + mkdir -p src + git mv vendor/lib/Leaving.java src/Leaving.java + git commit -m "Move out of vendor path" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when renamed out of vendor path" || ok=false + assert_eq "$(get_field "$output" "file")" "src/Leaving.java" "file should be new path" || ok=false + assert_eq "$(get_field "$output" "status")" "R" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "renamed file in vendor path" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_dual_copyright_sentry_and_third_party() { + setup_branch + mkdir -p src + cat > src/DualCopyright.java << 'JAVA' +// Copyright 2024 Functional Software, Inc. and Example Corp +// Licensed under the Apache License 2.0. +package com.example; +public class DualCopyright {} +JAVA + git add src/DualCopyright.java + git commit -m "Add dual-copyright file" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 for dual-copyright file" || ok=false + assert_eq "$(get_field "$output" "file")" "src/DualCopyright.java" "file path" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers in file" "reasons" || ok=false [[ "$ok" == "true" ]] } +test_committed_modified_then_staged_delete() { + mkdir -p src + cat > src/Conflict.java << 'JAVA' +// Adapted from ConflictLib. +// Copyright 2024 Conflict Author. +// Licensed under the MIT License. +package com.example; +public class Conflict {} +JAVA + git add src/Conflict.java + git commit -m "Add attributed file" --quiet + + setup_branch + cat > src/Conflict.java << 'JAVA' +// Adapted from ConflictLib. +// Copyright 2025 Conflict Author. +// Licensed under the MIT License. +package com.example; +public class Conflict { void updated() {} } +JAVA + git add src/Conflict.java + git commit -m "Modify file" --quiet + + git rm src/Conflict.java --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_eq "$(get_field "$output" "file")" "src/Conflict.java" "file path" || ok=false + assert_eq "$(get_field "$output" "status")" "D" "staged delete should override committed modify" || ok=false + assert_contains "$(get_field "$output" "reasons")" "deleted file with attribution markers" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_notices_file_not_emitted_as_candidate() { + setup_branch + mkdir -p src + cat > src/Vendored.java << 'JAVA' +// Adapted from SomeLib. +// Copyright 2024 Some Author. +// Licensed under the Apache License 2.0. +package com.example; +public class Vendored {} +JAVA + echo "## New Entry" >> THIRD_PARTY_NOTICES.md + git add src/Vendored.java THIRD_PARTY_NOTICES.md + git commit -m "Add vendored file and update notices" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when candidates found" || ok=false + assert_global_metadata_prefix "$output" true true "NOTICES changed with candidate" || ok=false + local count + count=$(echo "$output" | grep -c "^file: " || true) + assert_eq "$count" "1" "THIRD_PARTY_NOTICES.md must not appear as a candidate" || ok=false + assert_eq "$(get_field "$output" "file")" "src/Vendored.java" "only the source file should be a candidate" || ok=false + [[ "$ok" == "true" ]] +} + +test_removed_sentry_copyright_not_flagged() { + mkdir -p src + cat > src/SentryFile.java << 'JAVA' +// Copyright 2025 Functional Software, Inc. +// Licensed under the Apache License, Version 2.0. +package com.example; +public class SentryFile {} +JAVA + git add src/SentryFile.java + git commit -m "Add file with Sentry copyright" --quiet + + setup_branch + cat > src/SentryFile.java << 'JAVA' +package com.example; +public class SentryFile {} +JAVA + git add src/SentryFile.java + git commit -m "Remove Sentry copyright" --quiet + + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "removing Sentry-only copyright should not trigger attribution" || return 1 + assert_global_metadata_prefix "$output" true false "removed Sentry copyright" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 +} # --- Run all tests --- run_test "Clean branch — no candidates" test_clean_branch_no_candidates run_test "New file with attribution markers" test_new_file_with_attribution run_test "New file in vendor path" test_new_file_in_vendor_path +run_test "New file under io/sentry/vendor" test_new_file_under_io_sentry_vendor_path run_test "Deleted file with attribution" test_deleted_file_with_attribution run_test "Modified file — attribution added" test_modified_file_attribution_added run_test "Modified file — attribution removed" test_modified_file_attribution_removed @@ -662,18 +943,28 @@ run_test "Staged new file detected" test_staged_new_file_detect run_test "Staged modification detected" test_staged_modification_detected run_test "Untracked file detected" test_untracked_file_detected run_test "THIRD_PARTY_NOTICES.md change — committed" test_notices_file_changed_true +run_test "THIRD_PARTY_NOTICES.md only — exit 10, no file blocks" test_notices_only_change_triggers_without_file_candidates +run_test "THIRD_PARTY_NOTICES.md only — unstaged, no file blocks" test_notices_only_unstaged_triggers_without_file_candidates +run_test "THIRD_PARTY_NOTICES.md deleted on branch" test_notices_deleted_on_branch run_test "THIRD_PARTY_NOTICES.md change — staged" test_notices_staged_change_detected +run_test "THIRD_PARTY_NOTICES file not emitted as candidate" test_notices_file_not_emitted_as_candidate +run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file run_test "Excluded paths skipped" test_excluded_paths_skipped run_test "Generated files skipped" test_generated_files_skipped +run_test "Binary file skipped" test_binary_file_skipped run_test "Sentry copyright not flagged" test_sentry_copyright_not_flagged +run_test "Removed Sentry copyright — not flagged" test_removed_sentry_copyright_not_flagged run_test "License-only header not flagged" test_license_only_header_not_flagged -run_test "Binary file skipped" test_binary_file_skipped run_test "Modified vendor-path file — attribution changed" test_modified_vendor_path_file_attribution_changed run_test "Modified vendored file — no attribution change" test_modified_vendored_file_no_attribution_change run_test "Modified first-party file — Sentry license header not flagged" test_modified_first_party_license_header_not_flagged run_test "Unstaged modification detected" test_unstaged_modification_detected run_test "Multiple candidates in single run" test_multiple_candidates -run_test "Missing THIRD_PARTY_NOTICES.md" test_missing_notices_file +run_test "Merge-base failure" test_merge_base_failure +run_test "Renamed into vendor path" test_renamed_into_vendor_path +run_test "Renamed out of vendor path" test_renamed_out_of_vendor_path +run_test "Dual copyright — Sentry and third-party" test_dual_copyright_sentry_and_third_party +run_test "Committed M + staged D resolves to D" test_committed_modified_then_staged_delete # --- Summary --- From 3127d5b43dafc5f1d8150221210e9a6e18926c57 Mon Sep 17 00:00:00 2001 From: Adam Brown Date: Fri, 15 May 2026 09:54:55 +0200 Subject: [PATCH 10/10] Fix false positives from first-party copyright edits and detect attribution stripping during renames --- .../find-attribution-candidates.sh | 10 +++ .../test/test-find-attribution-candidates.sh | 83 +++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/.claude/skills/check-code-attribution/find-attribution-candidates.sh b/.claude/skills/check-code-attribution/find-attribution-candidates.sh index 00eddbf1d23..4dad6e6ddd5 100755 --- a/.claude/skills/check-code-attribution/find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/find-attribution-candidates.sh @@ -269,6 +269,12 @@ while IFS=$'\t' read -r status filepath old_filepath; do if has_third_party_attribution "$WORK_DIR/content"; then is_candidate=true reasons+=("renamed file with attribution markers") + elif [[ "$is_candidate" == "false" ]]; then + git show "$MERGE_BASE":"${old_filepath:-$filepath}" > "$WORK_DIR/old_content" 2>/dev/null || true + if [[ -s "$WORK_DIR/old_content" ]] && has_third_party_attribution "$WORK_DIR/old_content"; then + is_candidate=true + reasons+=("attribution markers stripped during rename") + fi fi elif [[ "$status_char" == "M" ]]; then @@ -288,6 +294,10 @@ while IFS=$'\t' read -r status filepath old_filepath; do fi if [[ "$has_added" == "true" && "$has_removed" == "true" ]]; then + git show "$MERGE_BASE":"$filepath" > "$WORK_DIR/old_content" 2>/dev/null || continue + if ! has_third_party_attribution "$WORK_DIR/content" && ! has_third_party_attribution "$WORK_DIR/old_content" && ! has_vendor_path "$filepath"; then + continue + fi reasons+=("attribution markers modified") elif [[ "$has_added" == "true" ]]; then # The diff-hunk scan uses broad patterns (e.g., "copyright", "licensed") that diff --git a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh index db3b3685c54..0e170d3fe30 100644 --- a/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh +++ b/.claude/skills/check-code-attribution/test/test-find-attribution-candidates.sh @@ -824,6 +824,87 @@ JAVA [[ "$ok" == "true" ]] } +test_renamed_with_attribution_stripped() { + mkdir -p src/old + # File must be large enough that removing the 3-line header stays above git's + # 50% rename-detection similarity threshold. + cat > src/old/Stripped.java << 'JAVA' +// Adapted from StrippedLib. +// Copyright 2024 Stripped Author. +// Licensed under the MIT License. +package com.example; +public class Stripped { + private int field1; + private int field2; + private int field3; + public void method1() { field1 = 1; } + public void method2() { field2 = 2; } + public void method3() { field3 = 3; } + public int getField1() { return field1; } + public int getField2() { return field2; } + public int getField3() { return field3; } +} +JAVA + git add src/old/Stripped.java + git commit -m "Add attributed file" --quiet + + setup_branch + mkdir -p src/new + git mv src/old/Stripped.java src/new/Stripped.java + cat > src/new/Stripped.java << 'JAVA' +package com.example; +public class Stripped { + private int field1; + private int field2; + private int field3; + public void method1() { field1 = 1; } + public void method2() { field2 = 2; } + public void method3() { field3 = 3; } + public int getField1() { return field1; } + public int getField2() { return field2; } + public int getField3() { return field3; } +} +JAVA + git add src/new/Stripped.java + git commit -m "Rename and strip attribution" --quiet + + local output exit_code=0 ok=true + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "10" "should exit 10 when attribution stripped during rename" || ok=false + assert_eq "$(get_field "$output" "file")" "src/new/Stripped.java" "file should be new path" || ok=false + assert_eq "$(get_field "$output" "status")" "R" "status" || ok=false + assert_contains "$(get_field "$output" "reasons")" "attribution markers stripped during rename" "reasons" || ok=false + [[ "$ok" == "true" ]] +} + +test_modified_sentry_copyright_year_bump_not_flagged() { + mkdir -p src + cat > src/SentryYear.java << 'JAVA' +// Copyright 2024 Functional Software, Inc. +// Licensed under the Apache License, Version 2.0. +package com.example; +public class SentryYear {} +JAVA + git add src/SentryYear.java + git commit -m "Add file with Sentry copyright" --quiet + + setup_branch + cat > src/SentryYear.java << 'JAVA' +// Copyright 2025 Functional Software, Inc. +// Licensed under the Apache License, Version 2.0. +package com.example; +public class SentryYear {} +JAVA + git add src/SentryYear.java + git commit -m "Bump copyright year" --quiet + + local output exit_code=0 + output=$(run_script) || exit_code=$? + assert_eq "$exit_code" "0" "Sentry copyright year bump should not trigger attribution" || return 1 + assert_global_metadata_prefix "$output" true false "Sentry copyright year bump" || return 1 + assert_line_count "$output" 2 "exit 0 should print metadata only" || return 1 +} + test_dual_copyright_sentry_and_third_party() { setup_branch mkdir -p src @@ -963,6 +1044,8 @@ run_test "Multiple candidates in single run" test_multiple_candidates run_test "Merge-base failure" test_merge_base_failure run_test "Renamed into vendor path" test_renamed_into_vendor_path run_test "Renamed out of vendor path" test_renamed_out_of_vendor_path +run_test "Renamed with attribution stripped" test_renamed_with_attribution_stripped +run_test "Modified Sentry copyright year — not flagged" test_modified_sentry_copyright_year_bump_not_flagged run_test "Dual copyright — Sentry and third-party" test_dual_copyright_sentry_and_third_party run_test "Committed M + staged D resolves to D" test_committed_modified_then_staged_delete