feat(pierre): add diff chunking and configurable review settings

This commit is contained in:
u80864958
2026-02-13 16:18:49 +01:00
parent 2cb64194b9
commit cc321be658
8 changed files with 293 additions and 27 deletions

View File

@@ -37,7 +37,19 @@ type LLMConfig struct {
Model string `help:"Model to use" env:"LLM_MODEL"` Model string `help:"Model to use" env:"LLM_MODEL"`
} }
// ReviewConfig holds the review-specific CLI options.
//
// Struct tag values are always strings, so the MaxChunkSize default is written
// as `default:"60000"`; Kong parses that string into the int field, giving a
// default of 60000 bytes (about 60 KB).
type ReviewConfig struct {
	// MaxChunkSize is the maximum size, in bytes, of a single diff chunk.
	MaxChunkSize int `help:"Maximum diff chunk size in bytes" default:"60000"`
	// Guidelines lists the project guidelines to prepend; multiple values are separated by commas (sep:",").
	Guidelines []string `help:"Project guidelines to prepend" sep:","`
	// DisableComments turns the run into a dry run in which no comments are posted.
	DisableComments bool `help:"Disable posting comments (dry run)"`
}
type Config struct { type Config struct {
// Embedding ReviewConfig with a prefix changes flag names to `--review-…`.
// Existing configuration files using the old flag names will need to be updated.
// Consider keeping backwards compatibility if required.
Review ReviewConfig `embed:"" prefix:"review-"`
GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"` GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"`
Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"` Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"`
Gitea GiteaConfig `embed:"" prefix:"gitea-"` Gitea GiteaConfig `embed:"" prefix:"gitea-"`
@@ -117,7 +129,7 @@ func main() {
log.Fatalf("Error initializing AI: %v", err) log.Fatalf("Error initializing AI: %v", err)
} }
pierreService := pierre.New(ai, git) pierreService := pierre.New(ai, git, cfg.Review.MaxChunkSize, cfg.Review.Guidelines, cfg.Review.DisableComments)
if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil { if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil {
log.Fatalf("Error during review: %v", err) log.Fatalf("Error during review: %v", err)
} }

3
go.mod
View File

@@ -7,7 +7,9 @@ require (
github.com/alecthomas/kong v1.14.0 github.com/alecthomas/kong v1.14.0
github.com/alecthomas/kong-yaml v0.2.0 github.com/alecthomas/kong-yaml v0.2.0
github.com/google/generative-ai-go v0.20.1 github.com/google/generative-ai-go v0.20.1
github.com/google/go-cmp v0.7.0
github.com/ollama/ollama v0.16.0 github.com/ollama/ollama v0.16.0
github.com/sashabaranov/go-openai v1.41.2
google.golang.org/api v0.186.0 google.golang.org/api v0.186.0
) )
@@ -34,7 +36,6 @@ require (
github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect
github.com/hashicorp/go-version v1.7.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect
github.com/sashabaranov/go-openai v1.41.2 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
go.opencensus.io v0.24.0 // indirect go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect

View File

@@ -31,7 +31,7 @@ func (b *BitbucketAdapter) GetDiff(ctx context.Context, projectKey, repositorySl
if response.StatusCode != http.StatusOK { if response.StatusCode != http.StatusOK {
sb := &strings.Builder{} sb := &strings.Builder{}
io.Copy(sb, response.Body) io.Copy(sb, response.Body)
err = fmt.Errorf("error while fetching bitbucket diff staus %d, body %s", response.Status, sb.String()) err = fmt.Errorf("error while fetching bitbucket diff status %d, body %s", response.StatusCode, sb.String())
} }
diff = response.Body diff = response.Body

View File

@@ -4,10 +4,14 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"strings"
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter" "git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
) )
// DefaultChunkSize is the fallback maximum size, in bytes, of a single diff
// chunk. It applies when the configured chunk size is zero or negative.
const DefaultChunkSize = 60000
type Comment struct { type Comment struct {
File string `json:"file"` File string `json:"file"`
Line int `json:"line"` Line int `json:"line"`
@@ -19,21 +23,121 @@ func (s *Service) judgePR(ctx context.Context, diff io.Reader) (comments []Comme
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read diff: %w", err) return nil, fmt.Errorf("failed to read diff: %w", err)
} }
err = s.chat.GenerateStructured(ctx, []chatter.Message{
{
Role: chatter.RoleSystem,
Content: `
You are a very strict senior software architect.
You review **only** newly added or modified lines in a unified diff, together with the immediate hunk context.
You do **not** report issues that appear **solely** in deleted lines (“-”) or that have already been fixed by the change.
No comments are made on pure formatting/whitespace changes or reordering that does not alter the programs behavior.
`,
},
{
Role: chatter.RoleUser,
Content: fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.\n Here is the git diff of it: %s", string(diffBytes)),
},
}, &comments)
// Determine chunk size (use default if not set)
maxSize := s.maxChunkSize
if maxSize <= 0 {
maxSize = DefaultChunkSize // default 60KB ~ 15k tokens
}
chunks := splitDiffIntoChunks(diffBytes, maxSize)
allComments := []Comment{}
// Build optional guidelines text (added as a separate section with a clear delimiter)
guidelinesText := ""
if len(s.guidelines) > 0 {
// Two newlines ensure the guidelines start on a fresh paragraph.
guidelinesText = "\n\nProject guidelines:\n"
for _, g := range s.guidelines {
guidelinesText += "- " + g + "\n"
}
}
// System prompt that instructs the LLM precisely.
baseSystem := strings.TrimSpace(`
You are a strict senior software architect.
Only comment on newly added or modified lines in the diff; ignore deletions, pure formatting, or reordering that does not change behavior.
For each issue output a JSON object with fields "file", "line", and "message" (message should be concise, ≤2 sentences, and actionable).
If project guidelines are provided, treat them as hard rules that must be respected.`) + guidelinesText
for i, chunk := range chunks {
// Include the chunk identifier in the system message only if there are multiple chunks.
systemContent := baseSystem
if len(chunks) > 1 {
systemContent = fmt.Sprintf("%s\nChunk %d of %d.", baseSystem, i+1, len(chunks))
}
userContent := chunk
var chunkComments []Comment
err = s.chat.GenerateStructured(ctx, []chatter.Message{{
Role: chatter.RoleSystem,
Content: systemContent,
}, {
Role: chatter.RoleUser,
Content: userContent,
}}, &chunkComments)
if err != nil {
return nil, err
}
allComments = append(allComments, chunkComments...)
}
// Deduplicate comments (keyed by file:line)
unique := make(map[string]Comment)
for _, c := range allComments {
key := fmt.Sprintf("%s:%d", c.File, c.Line)
unique[key] = c
}
for _, v := range unique {
comments = append(comments, v)
}
return return
} }
// splitDiffIntoChunks splits a unified diff into consecutive chunks of at most
// maxSize bytes each. Concatenating the returned chunks in order reproduces the
// input exactly.
//
// Split points are chosen in order of preference:
//  1. file boundaries ("\ndiff --git "),
//  2. hunk boundaries ("\n@@ ") inside a file section larger than maxSize,
//  3. a forced cut inside a hunk larger than maxSize (see forcedCutIndex).
//
// A non-positive maxSize disables splitting and the whole diff is returned as
// a single chunk; cutting by a zero or negative size could never make progress.
func splitDiffIntoChunks(diff []byte, maxSize int) []string {
	if maxSize <= 0 || len(diff) <= maxSize {
		return []string{string(diff)}
	}

	const (
		fileSep = "\ndiff --git "
		hunkSep = "\n@@ "
	)

	var (
		chunks  []string
		current strings.Builder
	)
	// flush closes the chunk being accumulated, if there is one.
	flush := func() {
		if current.Len() > 0 {
			chunks = append(chunks, current.String())
			current.Reset()
		}
	}
	// add appends seg to the current chunk, starting a new chunk first when
	// seg does not fit. Callers only pass segments of at most maxSize bytes.
	add := func(seg string) {
		if current.Len()+len(seg) > maxSize {
			flush()
		}
		current.WriteString(seg)
	}

	for i, section := range strings.Split(string(diff), fileSep) {
		if i > 0 {
			// strings.Split removed the separator; put it back.
			section = fileSep + section
		}
		if len(section) <= maxSize {
			add(section)
			continue
		}

		// The file section is too large for one chunk. Start it on a fresh
		// chunk so its header is not glued to the previous file, then pack
		// it hunk by hunk.
		flush()
		for j, hunk := range strings.Split(section, hunkSep) {
			if j > 0 {
				hunk = hunkSep + hunk
			}
			if len(hunk) > maxSize {
				flush()
				for len(hunk) > maxSize {
					cut := forcedCutIndex(hunk, maxSize)
					chunks = append(chunks, hunk[:cut])
					hunk = hunk[cut:]
				}
			}
			add(hunk)
		}
	}
	flush()
	return chunks
}

// forcedCutIndex returns how many leading bytes of s to cut off when s has to
// be split at or before limit bytes. The caller guarantees 0 < limit < len(s);
// the result is always in the range [1, limit], so every cut makes progress.
//
// The cut is placed right after the last line break inside the window so that
// chunks end on complete diff lines. Without a usable line break it is moved
// back to the nearest UTF-8 character boundary so that no multi-byte character
// is torn apart.
func forcedCutIndex(s string, limit int) int {
	// A newline at index 0 is ignored: cutting there would emit a chunk that
	// consists of nothing but "\n".
	if nl := strings.LastIndexByte(s[:limit], '\n'); nl > 0 {
		return nl + 1
	}

	// s[cut] is the first byte of the remainder. If it is a UTF-8 continuation
	// byte (bit pattern 10xxxxxx) the cut sits inside a character. A valid
	// character has at most three continuation bytes, so step back at most
	// three times.
	cut := limit
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	if cut == 0 || s[cut]&0xC0 == 0x80 {
		// Either limit is smaller than one character or the data is not valid
		// UTF-8 here; fall back to the plain byte limit.
		return limit
	}
	return cut
}

View File

@@ -0,0 +1,132 @@
package pierre
import (
"bytes"
"context"
"io"
"strings"
"testing"
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
"github.com/google/go-cmp/cmp"
)
// mockChat is a ChatAdapter test double. It counts how often
// GenerateStructured is called and answers with one canned comment.
type mockChat struct{ callCount int }

// GenerateStructured records the call. When target is a *[]Comment it is
// overwritten with a single fixed comment; any other target is left untouched.
// The returned error is always nil.
func (m *mockChat) GenerateStructured(ctx context.Context, msgs []chatter.Message, target interface{}) error {
	m.callCount++

	out, isCommentSlice := target.(*[]Comment)
	if !isCommentSlice {
		return nil
	}
	*out = []Comment{{File: "file.go", Line: 1, Message: "test comment"}}
	return nil
}

// GetProviderName returns the fixed provider name of the mock.
func (m *mockChat) GetProviderName() string {
	return "mock"
}
// mockGit is a GitAdapter test double that serves a fixed two-file diff.
type mockGit struct{}

// GetDiff ignores its arguments and returns a reader over a diff that touches
// file1.go and file2.go. The returned error is always nil.
func (g *mockGit) GetDiff(ctx context.Context, owner, repo string, prID int) (io.ReadCloser, error) {
	const (
		firstFile  = "diff --git a/file1.go b/file1.go\n+line1\n"
		secondFile = "diff --git a/file2.go b/file2.go\n+line2\n"
	)
	return io.NopCloser(bytes.NewBufferString(firstFile + secondFile)), nil
}
// AddComment discards the comment and always reports success.
func (g *mockGit) AddComment(ctx context.Context, owner, repo string, prID int, comment Comment) error {
	return nil
}
// TestSplitDiffIntoChunks runs splitDiffIntoChunks over a table of diffs and
// checks, per scenario, the chunk count (when one is expected), the start of
// selected chunks, the size limit of every chunk and, where requested, that
// the chunks concatenate back to the original diff.
func TestSplitDiffIntoChunks(t *testing.T) {
	// A single file section that is far larger than the chunk size used for it.
	longLine := "+very long added line that will be repeated many times to exceed the chunk size\n"
	largeDiff := "diff --git a/large.txt b/large.txt\n" + strings.Repeat(longLine, 200)

	type scenario struct {
		name           string
		diff           string
		maxSize        int
		wantChunks     int // 0 disables the exact-count assertion
		wantPrefixes   []string
		checkRecombine bool
	}
	scenarios := []scenario{
		{
			name:           "small diff",
			diff:           "diff --git a/file1.txt b/file1.txt\n+added line\n",
			maxSize:        1000,
			wantChunks:     1,
			wantPrefixes:   []string{"diff --git a/file1.txt"},
			checkRecombine: true,
		},
		{
			name: "multiple files",
			diff: "diff --git a/file1.txt b/file1.txt\n+added line 1\n" +
				"diff --git a/file2.txt b/file2.txt\n+added line 2\n",
			maxSize:        50,
			wantChunks:     2,
			wantPrefixes:   []string{"diff --git a/file1.txt", "diff --git a/file2.txt"},
			checkRecombine: false,
		},
		{
			name:           "large single file",
			diff:           largeDiff,
			maxSize:        500,
			wantChunks:     0,
			wantPrefixes:   nil,
			checkRecombine: true,
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			got := splitDiffIntoChunks([]byte(sc.diff), sc.maxSize)

			if sc.wantChunks > 0 && len(got) != sc.wantChunks {
				t.Fatalf("expected %d chunks, got %d", sc.wantChunks, len(got))
			}

			for i, prefix := range sc.wantPrefixes {
				if i >= len(got) {
					t.Fatalf("missing chunk %d for prefix check", i)
				}
				// Chunks after the first start with the newline that precedes
				// the file header; ignore it for the comparison.
				if !strings.HasPrefix(strings.TrimPrefix(got[i], "\n"), prefix) {
					t.Fatalf("chunk %d does not start with expected prefix %q: %s", i, prefix, got[i])
				}
			}

			for i, chunk := range got {
				if sc.maxSize > 0 && len(chunk) > sc.maxSize {
					t.Fatalf("chunk %d exceeds max size %d: %d", i, sc.maxSize, len(chunk))
				}
			}

			if !sc.checkRecombine {
				return
			}
			if delta := cmp.Diff(sc.diff, strings.Join(got, "")); delta != "" {
				t.Fatalf("recombined diff differs:\n%s", delta)
			}
		})
	}
}
// TestJudgePR_ChunkAggregationAndDeduplication feeds judgePR the two-file diff
// served by mockGit with a chunk size of 50 bytes, which is small enough to
// split the diff into two chunks. mockChat answers every call with the same
// comment, so the result must collapse to a single comment while the mock
// records one call per chunk.
func TestJudgePR_ChunkAggregationAndDeduplication(t *testing.T) {
	ctx := context.Background()
	chat := new(mockChat)
	service := &Service{
		chat:         chat,
		git:          &mockGit{},
		guidelines:   nil,
		maxChunkSize: 50,
	}

	diff, err := service.git.GetDiff(ctx, "", "", 0)
	if err != nil {
		t.Fatalf("failed to get diff: %v", err)
	}
	defer diff.Close()

	comments, err := service.judgePR(ctx, diff)
	if err != nil {
		t.Fatalf("judgePR error: %v", err)
	}

	const wantComments = 1
	if got := len(comments); got != wantComments {
		t.Fatalf("expected %d comment after deduplication, got %d", wantComments, got)
	}

	const wantCalls = 2
	if calls := chat.callCount; calls != wantCalls {
		t.Fatalf("expected mockChat to be called for each chunk (2), got %d", calls)
	}
}

View File

@@ -7,15 +7,26 @@ import (
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter" "git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
) )
// Service holds the core collaborators and configuration for Pierre.
// Configuration fields are listed first, followed by the adapters. The order
// is a readability convention only: every field is unexported, so none of
// them is part of any encoding/json or encoding/gob output.
type Service struct { type Service struct {
maxChunkSize int
guidelines []string
disableComments bool
git GitAdapter git GitAdapter
chat ChatAdapter chat ChatAdapter
} }
func New(chat ChatAdapter, git GitAdapter) *Service { func New(chat ChatAdapter, git GitAdapter, maxChunkSize int, guidelines []string, disableComments bool) *Service {
return &Service{ return &Service{
git: git, git: git,
chat: chat, chat: chat,
maxChunkSize: maxChunkSize,
guidelines: guidelines,
disableComments: disableComments,
} }
} }

View File

@@ -26,13 +26,19 @@ func (s *Service) MakeReview(ctx context.Context, organisation string, repo stri
for _, c := range comments { for _, c := range comments {
c.Message = fmt.Sprintf("%s (Generated by: %s)", c.Message, model) c.Message = fmt.Sprintf("%s (Generated by: %s)", c.Message, model)
if s.disableComments {
// Dryrun: just log what would have been posted.
log.Printf("dryrun: %s:%d => %s", c.File, c.Line, c.Message)
} else {
// Normal mode: print to stdout and post the comment to the VCS.
fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n", fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n",
c.File, c.Line, c.Message, "---") c.File, c.Line, c.Message, "---")
if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil { if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil {
log.Printf("Failed to add comment: %v", err) log.Printf("Failed to add comment: %v", err)
} }
} }
}
return nil return nil
} }