feat(pierre): add diff chunking and configurable review settings #2

Open
schreifuchs wants to merge 4 commits from feat/context-improvements into main
8 changed files with 293 additions and 27 deletions
Showing only changes of commit cc321be658 - Show all commits

View File

@@ -37,7 +37,19 @@ type LLMConfig struct {
Model string `help:"Model to use" env:"LLM_MODEL"` Model string `help:"Model to use" env:"LLM_MODEL"`
} }
// ReviewConfig holds the review-specific CLI options.
// The `default:"60000"` tag sets an integer default of 60000 bytes (about 60 KB); Kong parses
// the string value into the int field, which can be confusing for readers.
type ReviewConfig struct {
// MaxChunkSize is the upper bound, in bytes, for one diff chunk; it is handed to pierre.New.
MaxChunkSize int `help:"Maximum diff chunk size in bytes" default:"60000"`
// Guidelines are project rules added to the review prompt.
// NOTE(review): the `sep:","` tag presumably lets one flag carry a comma-separated list - confirm against Kong.
Guidelines []string `help:"Project guidelines to prepend" sep:","`
// DisableComments switches the review to a dry run: findings are logged instead of posted.
DisableComments bool `help:"Disable posting comments (dry run)"`
}
type Config struct { type Config struct {
// Embedding ReviewConfig with a prefix changes flag names to `--review-…`.
// Existing configuration files using the old flag names will need to be updated.
// Consider keeping backwards compatibility if required.
Review ReviewConfig `embed:"" prefix:"review-"`
GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"` GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"`
Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"` Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"`
Gitea GiteaConfig `embed:"" prefix:"gitea-"` Gitea GiteaConfig `embed:"" prefix:"gitea-"`
@@ -117,7 +129,7 @@ func main() {
log.Fatalf("Error initializing AI: %v", err) log.Fatalf("Error initializing AI: %v", err)
} }
pierreService := pierre.New(ai, git) pierreService := pierre.New(ai, git, cfg.Review.MaxChunkSize, cfg.Review.Guidelines, cfg.Review.DisableComments)
if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil { if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil {
log.Fatalf("Error during review: %v", err) log.Fatalf("Error during review: %v", err)
} }

3
go.mod
View File

@@ -7,7 +7,9 @@ require (
github.com/alecthomas/kong v1.14.0 github.com/alecthomas/kong v1.14.0
github.com/alecthomas/kong-yaml v0.2.0 github.com/alecthomas/kong-yaml v0.2.0
github.com/google/generative-ai-go v0.20.1 github.com/google/generative-ai-go v0.20.1
schreifuchs marked this conversation as resolved
Review

You added github.com/google/go-cmp v0.7.0 as a direct dependency – good for the new tests. Remember to run go mod tidy so that the go.sum is updated accordingly. (Generated by: OpenAI (openai/gpt-oss-120b))

You added `github.com/google/go-cmp v0.7.0` as a direct dependency – good for the new tests. Remember to run `go mod tidy` so that the `go.sum` is updated accordingly. (Generated by: OpenAI (openai/gpt-oss-120b))
github.com/google/go-cmp v0.7.0
github.com/ollama/ollama v0.16.0 github.com/ollama/ollama v0.16.0
github.com/sashabaranov/go-openai v1.41.2
google.golang.org/api v0.186.0 google.golang.org/api v0.186.0
) )
@@ -34,7 +36,6 @@ require (
github.com/googleapis/gax-go/v2 v2.12.5 // indirect github.com/googleapis/gax-go/v2 v2.12.5 // indirect
github.com/hashicorp/go-version v1.7.0 // indirect github.com/hashicorp/go-version v1.7.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect github.com/mailru/easyjson v0.7.7 // indirect
github.com/sashabaranov/go-openai v1.41.2 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
go.opencensus.io v0.24.0 // indirect go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect

View File

@@ -31,7 +31,7 @@ func (b *BitbucketAdapter) GetDiff(ctx context.Context, projectKey, repositorySl
if response.StatusCode != http.StatusOK { if response.StatusCode != http.StatusOK {
sb := &strings.Builder{} sb := &strings.Builder{}
io.Copy(sb, response.Body) io.Copy(sb, response.Body)
err = fmt.Errorf("error while fetching bitbucket diff staus %d, body %s", response.Status, sb.String()) err = fmt.Errorf("error while fetching bitbucket diff status %d, body %s", response.StatusCode, sb.String())
} }
diff = response.Body diff = response.Body

View File

@@ -4,10 +4,14 @@ import (
"context" "context"
"fmt" "fmt"
"io" "io"
"strings"
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter" "git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
) )
// DefaultChunkSize is the fallback maximum size (in bytes) for a diff chunk when no explicit value is configured.
// judgePR substitutes it whenever the service's maxChunkSize is zero or negative.
const DefaultChunkSize = 60000
type Comment struct { type Comment struct {
File string `json:"file"` File string `json:"file"`
Line int `json:"line"` Line int `json:"line"`
@@ -19,21 +23,121 @@ func (s *Service) judgePR(ctx context.Context, diff io.Reader) (comments []Comme
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read diff: %w", err) return nil, fmt.Errorf("failed to read diff: %w", err)
} }
err = s.chat.GenerateStructured(ctx, []chatter.Message{
{
Role: chatter.RoleSystem,
Content: `
You are a very strict senior software architect.
You review **only** newly added or modified lines in a unified diff, together with the immediate hunk context.
You do **not** report issues that appear **solely** in deleted lines (“-”) or that have already been fixed by the change.
No comments are made on pure formatting/whitespace changes or reordering that does not alter the programs behavior.
`,
},
{
Role: chatter.RoleUser,
Content: fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.\n Here is the git diff of it: %s", string(diffBytes)),
},
}, &comments)
// Determine chunk size (use default if not set)
maxSize := s.maxChunkSize
if maxSize <= 0 {
maxSize = DefaultChunkSize // default 60KB ~ 15k tokens
}
chunks := splitDiffIntoChunks(diffBytes, maxSize)
allComments := []Comment{}
// Build optional guidelines text (added as a separate section with a clear delimiter)
guidelinesText := ""
if len(s.guidelines) > 0 {
// Two newlines ensure the guidelines start on a fresh paragraph.
guidelinesText = "\n\nProject guidelines:\n"
for _, g := range s.guidelines {
guidelinesText += "- " + g + "\n"
}
}
// System prompt that instructs the LLM precisely.
baseSystem := strings.TrimSpace(`
You are a strict senior software architect.
Only comment on newly added or modified lines in the diff; ignore deletions, pure formatting, or reordering that does not change behavior.
For each issue output a JSON object with fields "file", "line", and "message" (message should be concise, ≤2 sentences, and actionable).
If project guidelines are provided, treat them as hard rules that must be respected.`) + guidelinesText
for i, chunk := range chunks {
// Include the chunk identifier in the system message only if there are multiple chunks.
systemContent := baseSystem
if len(chunks) > 1 {
systemContent = fmt.Sprintf("%s\nChunk %d of %d.", baseSystem, i+1, len(chunks))
}
userContent := chunk
var chunkComments []Comment
err = s.chat.GenerateStructured(ctx, []chatter.Message{{
Role: chatter.RoleSystem,
Content: systemContent,
}, {
Role: chatter.RoleUser,
Content: userContent,
}}, &chunkComments)
if err != nil {
return nil, err
}
allComments = append(allComments, chunkComments...)
}
// Deduplicate comments (keyed by file:line)
unique := make(map[string]Comment)
for _, c := range allComments {
key := fmt.Sprintf("%s:%d", c.File, c.Line)
unique[key] = c
}
for _, v := range unique {
comments = append(comments, v)
}
return return
} }
// splitDiffIntoChunks splits a diff into chunks that do not exceed maxSize bytes.
//
// Splitting is attempted in three stages:
//  1. on file boundaries ("\ndiff --git "),
//  2. for a file that is still too large, on hunk boundaries ("\n@@ "),
//  3. for a hunk that is still too large, on a hard byte limit. The hard cut
//     is moved back to the start of a UTF-8 sequence where possible so that a
//     multi-byte character is not torn across two chunks.
//
// Separators removed by the split are restored, so concatenating the returned
// chunks always yields the original diff.
//
// A diff that already fits is returned as a single chunk. A non-positive
// maxSize cannot be honoured (a zero limit would never make progress and a
// negative one is not a valid slice bound), so it is treated as "no limit" and
// the whole diff is returned as one chunk.
func splitDiffIntoChunks(diff []byte, maxSize int) []string {
	if maxSize <= 0 || len(diff) <= maxSize {
		return []string{string(diff)}
	}

	var (
		chunks  []string
		current strings.Builder
	)

	// flushIfFull emits the pending chunk when appending n more bytes would
	// push it over the limit. An empty pending chunk is never emitted.
	flushIfFull := func(n int) {
		if current.Len() > 0 && current.Len()+n > maxSize {
			chunks = append(chunks, current.String())
			current.Reset()
		}
	}

	for idx, part := range strings.Split(string(diff), "\ndiff --git ") {
		seg := part
		if idx != 0 {
			// Restore the separator that Split removed.
			seg = "\ndiff --git " + part
		}

		flushIfFull(len(seg))
		if len(seg) <= maxSize {
			current.WriteString(seg)
			continue
		}

		// The file alone is too large: fall back to hunk boundaries.
		for j, h := range strings.Split(seg, "\n@@ ") {
			hseg := h
			if j != 0 {
				// Restore the hunk marker that Split removed.
				hseg = "\n@@ " + h
			}

			// If hseg is oversized this always leaves current empty, so the
			// hard-split pieces below are emitted in order.
			flushIfFull(len(hseg))

			for len(hseg) > maxSize {
				cut := maxSize
				// Bytes of the form 10xxxxxx are UTF-8 continuation bytes;
				// step back so the cut lands on the start of a character.
				for cut > 0 && hseg[cut]&0xC0 == 0x80 {
					cut--
				}
				if cut == 0 {
					// No boundary inside the window (invalid UTF-8 or a
					// limit smaller than one character): cut at the limit
					// so the loop still makes progress.
					cut = maxSize
				}
				chunks = append(chunks, hseg[:cut])
				hseg = hseg[cut:]
			}
			current.WriteString(hseg)
		}
	}

	if current.Len() > 0 {
		chunks = append(chunks, current.String())
	}
	return chunks
}

View File

@@ -0,0 +1,132 @@
package pierre
import (
"bytes"
"context"
"io"
"strings"
"testing"
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
"github.com/google/go-cmp/cmp"
)
// mockChat is a ChatAdapter test double. It counts how often
// GenerateStructured is invoked and always answers with the same comment.
type mockChat struct {
	callCount int
}

// GenerateStructured records the call and, when target is a *[]Comment,
// overwrites it with one fixed comment. It never returns an error.
func (m *mockChat) GenerateStructured(ctx context.Context, msgs []chatter.Message, target interface{}) error {
	m.callCount++

	out, isCommentSlice := target.(*[]Comment)
	if !isCommentSlice {
		return nil
	}
	*out = []Comment{{File: "file.go", Line: 1, Message: "test comment"}}
	return nil
}

// GetProviderName reports the fixed provider name used by the mock.
func (m *mockChat) GetProviderName() string {
	return "mock"
}
// mockGit is a GitAdapter test double that serves a fixed two-file diff and
// accepts every comment.
type mockGit struct{}

// GetDiff ignores its arguments and returns a reader over a diff touching
// file1.go and file2.go.
func (g *mockGit) GetDiff(ctx context.Context, owner, repo string, prID int) (io.ReadCloser, error) {
	const twoFileDiff = "diff --git a/file1.go b/file1.go\n+line1\n" +
		"diff --git a/file2.go b/file2.go\n+line2\n"

	body := bytes.NewReader([]byte(twoFileDiff))
	return io.NopCloser(body), nil
}

// AddComment discards the comment and reports success.
func (g *mockGit) AddComment(ctx context.Context, owner, repo string, prID int, comment Comment) error {
	return nil
}
// TestSplitDiffIntoChunks runs splitDiffIntoChunks over a table of diffs and
// checks the chunk count, the expected chunk prefixes, the per-chunk size
// limit and, where requested, that the chunks concatenate back to the input.
func TestSplitDiffIntoChunks(t *testing.T) {
	type scenario struct {
		name           string
		diff           string
		maxSize        int
		wantChunks     int // 0 means we don't assert exact count
		wantPrefixes   []string
		checkRecombine bool
	}

	// One file header followed by 200 copies of a long added line.
	repeated := "+very long added line that will be repeated many times to exceed the chunk size\n"
	largeDiff := "diff --git a/large.txt b/large.txt\n" + strings.Repeat(repeated, 200)

	table := []scenario{
		{
			name:           "small diff",
			diff:           "diff --git a/file1.txt b/file1.txt\n+added line\n",
			maxSize:        1000,
			wantChunks:     1,
			wantPrefixes:   []string{"diff --git a/file1.txt"},
			checkRecombine: true,
		},
		{
			name: "multiple files",
			diff: "diff --git a/file1.txt b/file1.txt\n+added line 1\n" +
				"diff --git a/file2.txt b/file2.txt\n+added line 2\n",
			maxSize:        50,
			wantChunks:     2,
			wantPrefixes:   []string{"diff --git a/file1.txt", "diff --git a/file2.txt"},
			checkRecombine: false,
		},
		{
			name:           "large single file",
			diff:           largeDiff,
			maxSize:        500,
			wantChunks:     0,
			wantPrefixes:   nil,
			checkRecombine: true,
		},
	}

	for _, sc := range table {
		t.Run(sc.name, func(t *testing.T) {
			got := splitDiffIntoChunks([]byte(sc.diff), sc.maxSize)

			// Exact count is only checked when the scenario asks for it.
			if sc.wantChunks > 0 && sc.wantChunks != len(got) {
				t.Fatalf("expected %d chunks, got %d", sc.wantChunks, len(got))
			}

			for pos := range sc.wantPrefixes {
				if pos >= len(got) {
					t.Fatalf("missing chunk %d for prefix check", pos)
				}
				want := sc.wantPrefixes[pos]
				// Later chunks keep the newline that preceded their header.
				body := strings.TrimPrefix(got[pos], "\n")
				if !strings.HasPrefix(body, want) {
					t.Fatalf("chunk %d does not start with expected prefix %q: %s", pos, want, got[pos])
				}
			}

			if sc.maxSize > 0 {
				for pos, chunk := range got {
					if len(chunk) > sc.maxSize {
						t.Fatalf("chunk %d exceeds max size %d: %d", pos, sc.maxSize, len(chunk))
					}
				}
			}

			if !sc.checkRecombine {
				return
			}
			if delta := cmp.Diff(sc.diff, strings.Join(got, "")); delta != "" {
				t.Fatalf("recombined diff differs:\n%s", delta)
			}
		})
	}
}
// TestJudgePR_ChunkAggregationAndDeduplication feeds judgePR the mock two-file
// diff with a 50-byte chunk limit. It expects one chat call per chunk and the
// identical comments from both calls to collapse into a single result.
func TestJudgePR_ChunkAggregationAndDeduplication(t *testing.T) {
	ctx := context.Background()
	chat := new(mockChat)
	service := &Service{
		git:          &mockGit{},
		chat:         chat,
		maxChunkSize: 50,
		guidelines:   nil,
	}

	reader, err := service.git.GetDiff(ctx, "", "", 0)
	if err != nil {
		t.Fatalf("failed to get diff: %v", err)
	}
	defer reader.Close()

	result, err := service.judgePR(ctx, reader)
	if err != nil {
		t.Fatalf("judgePR error: %v", err)
	}

	const wantComments = 1
	if len(result) != wantComments {
		t.Fatalf("expected %d comment after deduplication, got %d", wantComments, len(result))
	}

	if calls := chat.callCount; calls != 2 {
		t.Fatalf("expected mockChat to be called for each chunk (2), got %d", calls)
	}
}

View File

@@ -7,15 +7,26 @@ import (
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter" "git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
) )
// Service holds the core collaborators and configuration for Pierre.
// Configuration fields are listed first, followed by the adapters. The order
// is for readability only: every field is unexported, so encoding/json and
// encoding/gob skip them and the field order has no effect on any serialized
// form.
type Service struct { type Service struct {
git GitAdapter maxChunkSize int
chat ChatAdapter guidelines []string
disableComments bool
git GitAdapter
chat ChatAdapter
} }
func New(chat ChatAdapter, git GitAdapter) *Service { func New(chat ChatAdapter, git GitAdapter, maxChunkSize int, guidelines []string, disableComments bool) *Service {
return &Service{ return &Service{
git: git, git: git,
chat: chat, chat: chat,
maxChunkSize: maxChunkSize,
guidelines: guidelines,
disableComments: disableComments,
} }
} }

View File

@@ -26,11 +26,17 @@ func (s *Service) MakeReview(ctx context.Context, organisation string, repo stri
for _, c := range comments { for _, c := range comments {
c.Message = fmt.Sprintf("%s (Generated by: %s)", c.Message, model) c.Message = fmt.Sprintf("%s (Generated by: %s)", c.Message, model)
fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n",
c.File, c.Line, c.Message, "---")
if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil { if s.disableComments {
log.Printf("Failed to add comment: %v", err) // Dryrun: just log what would have been posted.
log.Printf("dryrun: %s:%d => %s", c.File, c.Line, c.Message)
} else {
schreifuchs marked this conversation as resolved
Review

Update the comment above the block to reflect that comments are posted only when not in dry‑run mode. (Reason: the existing comment is misleading — the code posts comments only when disableComments (dry‑run) is false, so the comment should state that posting occurs only when not in dry‑run mode.) (Generated by: OpenAI (openai/gpt-oss-120b))

Update the comment above the block to reflect that comments are posted only when not in dry‑run mode. (Reason: the existing comment is misleading — the code posts comments only when `disableComments` (dry‑run) is false, so the comment should state that posting occurs only when not in dry‑run mode.) (Generated by: OpenAI (openai/gpt-oss-120b))
// Normal mode: print to stdout and post the comment to the VCS.
fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n",
c.File, c.Line, c.Message, "---")
if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil {
log.Printf("Failed to add comment: %v", err)
}
} }
} }