feat(pierre): add diff chunking and configurable review settings

This commit is contained in:
u80864958
2026-02-13 16:18:49 +01:00
parent 2cb64194b9
commit 985fac347c
7 changed files with 182 additions and 23 deletions

View File

@@ -37,7 +37,13 @@ type LLMConfig struct {
Model string `help:"Model to use" env:"LLM_MODEL"`
}
type ReviewConfig struct {
MaxChunkChars int `help:"Maximum diff chunk size in characters (default 60000)" default:"60000"`
Guidelines []string `help:"Project-specific review guidelines"`
}
type Config struct {
Review ReviewConfig `embed:"" prefix:"review-"`
GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"`
Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"`
Gitea GiteaConfig `embed:"" prefix:"gitea-"`
@@ -117,7 +123,7 @@ func main() {
log.Fatalf("Error initializing AI: %v", err)
}
pierreService := pierre.New(ai, git)
pierreService := pierre.New(ai, git, cfg.Review.MaxChunkChars, cfg.Review.Guidelines)
if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil {
log.Fatalf("Error during review: %v", err)
}

3
go.mod
View File

@@ -7,7 +7,9 @@ require (
github.com/alecthomas/kong v1.14.0
github.com/alecthomas/kong-yaml v0.2.0
github.com/google/generative-ai-go v0.20.1
github.com/google/go-cmp v0.7.0
github.com/ollama/ollama v0.16.0
github.com/sashabaranov/go-openai v1.41.2
google.golang.org/api v0.186.0
)
@@ -34,7 +36,6 @@ require (
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
github.com/hashicorp/go-version v1.7.0 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/sashabaranov/go-openai v1.41.2 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
go.opencensus.io v0.24.0 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect

View File

@@ -31,7 +31,7 @@ func (b *BitbucketAdapter) GetDiff(ctx context.Context, projectKey, repositorySl
if response.StatusCode != http.StatusOK {
sb := &strings.Builder{}
io.Copy(sb, response.Body)
err = fmt.Errorf("error while fetching bitbucket diff staus %d, body %s", response.Status, sb.String())
err = fmt.Errorf("error while fetching bitbucket diff status %d, body %s", response.StatusCode, sb.String())
}
diff = response.Body

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io"
"strings"
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
)
@@ -19,21 +20,111 @@ func (s *Service) judgePR(ctx context.Context, diff io.Reader) (comments []Comme
if err != nil {
return nil, fmt.Errorf("failed to read diff: %w", err)
}
err = s.chat.GenerateStructured(ctx, []chatter.Message{
{
Role: chatter.RoleSystem,
Content: `
// Determine chunk size (use default if not set)
maxSize := s.maxChunkSize
if maxSize <= 0 {
maxSize = 60000 // default 60KB ~ 15k tokens
}
chunks := splitDiffIntoChunks(diffBytes, maxSize)
allComments := []Comment{}
// Build optional guidelines text
guidelinesText := ""
if len(s.guidelines) > 0 {
guidelinesText = "\nProject guidelines:\n"
for _, g := range s.guidelines {
guidelinesText += "- " + g + "\n"
}
}
baseSystem := `
You are a very strict senior software architect.
You review **only** newly added or modified lines in a unified diff, together with the immediate hunk context.
You do **not** report issues that appear **solely** in deleted lines (“-”) or that have already been fixed by the change.
No comments are made on pure formatting/whitespace changes or reordering that does not alter the programs behavior.
`,
},
{
Role: chatter.RoleUser,
Content: fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.\n Here is the git diff of it: %s", string(diffBytes)),
},
}, &comments)
No comments are made on pure formatting/whitespace changes or reordering that does not alter the programs behavior.` + guidelinesText + `
`
for i, chunk := range chunks {
// Add a small header so the model knows this is a fragment
header := fmt.Sprintf("\n--- Chunk %d of %d ---\n", i+1, len(chunks))
userContent := fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.%s\nHere is the git diff of it: %s", header, chunk)
err = s.chat.GenerateStructured(ctx, []chatter.Message{{
Role: chatter.RoleSystem,
Content: baseSystem,
}, {
Role: chatter.RoleUser,
Content: userContent,
}}, &comments)
if err != nil {
return nil, err
}
allComments = append(allComments, comments...)
}
// Deduplicate comments (keyed by file:line)
unique := make(map[string]Comment)
for _, c := range allComments {
key := fmt.Sprintf("%s:%d", c.File, c.Line)
unique[key] = c
}
for _, v := range unique {
comments = append(comments, v)
}
return
}
// splitDiffIntoChunks splits a diff into chunks that do not exceed maxSize bytes.
// It tries to split on file boundaries ("diff --git") first, then on hunk boundaries (@@),
// and finally on a hard byte limit.
func splitDiffIntoChunks(diff []byte, maxSize int) []string {
if len(diff) <= maxSize {
return []string{string(diff)}
}
content := string(diff)
// Split by file headers
parts := strings.Split(content, "\ndiff --git ")
chunks := []string{}
var current strings.Builder
for idx, part := range parts {
seg := part
if idx != 0 {
seg = "diff --git " + part
}
if current.Len()+len(seg) > maxSize && current.Len() > 0 {
chunks = append(chunks, current.String())
current.Reset()
}
if len(seg) > maxSize {
// Split further by hunks
hunks := strings.Split(seg, "\n@@ ")
for j, h := range hunks {
hseg := h
if j != 0 {
hseg = "@@ " + h
}
if current.Len()+len(hseg) > maxSize && current.Len() > 0 {
chunks = append(chunks, current.String())
current.Reset()
}
if len(hseg) > maxSize {
for len(hseg) > maxSize {
chunks = append(chunks, hseg[:maxSize])
hseg = hseg[maxSize:]
}
current.WriteString(hseg)
} else {
current.WriteString(hseg)
}
}
} else {
current.WriteString(seg)
}
}
if current.Len() > 0 {
chunks = append(chunks, current.String())
}
return chunks
}

View File

@@ -0,0 +1,57 @@
package pierre
import (
"github.com/google/go-cmp/cmp"
"strings"
"testing"
)
func TestSplitDiffIntoChunks_SmallDiff(t *testing.T) {
diff := "diff --git a/file1.txt b/file1.txt\n+added line\n"
max := 1000
chunks := splitDiffIntoChunks([]byte(diff), max)
if got, want := len(chunks), 1; got != want {
t.Fatalf("expected %d chunk, got %d", want, got)
}
if diff != chunks[0] {
t.Fatalf("chunk content changed: %s", diff)
}
}
func TestSplitDiffIntoChunks_MultipleFiles(t *testing.T) {
diff := "diff --git a/file1.txt b/file1.txt\n+added line 1\n" +
"diff --git a/file2.txt b/file2.txt\n+added line 2\n"
max := 50 // each file diff is less than this, but total exceeds
chunks := splitDiffIntoChunks([]byte(diff), max)
if got, want := len(chunks), 2; got != want {
t.Fatalf("expected %d chunks, got %d", want, got)
}
// Ensure each chunk starts with the proper file header
if !strings.HasPrefix(chunks[0], "diff --git a/file1.txt") {
t.Fatalf("first chunk does not contain file1 header: %s", chunks[0])
}
if !strings.HasPrefix(chunks[1], "diff --git a/file2.txt") {
t.Fatalf("second chunk does not contain file2 header: %s", chunks[1])
}
}
func TestSplitDiffIntoChunks_LargeSingleFile(t *testing.T) {
// Create a diff that exceeds maxSize by repeating a line
line := "+very long added line that will be repeated many times to exceed the chunk size\n"
diff := "diff --git a/large.txt b/large.txt\n" + strings.Repeat(line, 200)
max := 500 // small limit
chunks := splitDiffIntoChunks([]byte(diff), max)
// Ensure no chunk exceeds max size
for i, c := range chunks {
if len(c) > max {
t.Fatalf("chunk %d exceeds max size: %d > %d", i, len(c), max)
}
}
// Recombine chunks and compare to original (ignoring possible split boundaries)
recombined := strings.Join(chunks, "")
if diff != recombined {
if d := cmp.Diff(diff, recombined); d != "" {
t.Fatalf("recombined diff differs:\n%s", d)
}
}
}

View File

@@ -8,14 +8,18 @@ import (
)
type Service struct {
maxChunkSize int
guidelines []string
git GitAdapter
chat ChatAdapter
}
func New(chat ChatAdapter, git GitAdapter) *Service {
func New(chat ChatAdapter, git GitAdapter, maxChunkSize int, guidelines []string) *Service {
return &Service{
git: git,
chat: chat,
maxChunkSize: maxChunkSize,
guidelines: guidelines,
}
}