feat(pierre): add diff chunking and configurable review settings #2
@@ -37,7 +37,19 @@ type LLMConfig struct {
|
||||
Model string `help:"Model to use" env:"LLM_MODEL"`
|
||||
}
|
||||
|
||||
// ReviewConfig holds the review-specific CLI options.
//
// NOTE: the `default:"60000"` tag is a string that Kong parses into the int
// field at flag-parsing time, so MaxChunkSize defaults to 60 KB.
type ReviewConfig struct {
	// MaxChunkSize is the maximum size, in bytes, of a single diff chunk sent
	// to the LLM. A non-positive value falls back to DefaultChunkSize inside
	// judgePR.
	MaxChunkSize int `help:"Maximum diff chunk size in bytes" default:"60000"`

	// Guidelines are project rules prepended to the review prompt;
	// comma-separated on the command line (sep:",").
	Guidelines []string `help:"Project guidelines to prepend" sep:","`

	// DisableComments switches the bot into dry-run mode: comments are
	// logged instead of being posted to the VCS.
	DisableComments bool `help:"Disable posting comments (dry run)"`
}
|
||||
|
||||
type Config struct {
|
||||
// Embedding ReviewConfig with a prefix changes flag names to `--review-…`.
|
||||
// Existing configuration files using the old flag names will need to be updated.
|
||||
// Consider keeping backwards compatibility if required.
|
||||
Review ReviewConfig `embed:"" prefix:"review-"`
|
||||
GitProvider string `help:"Git provider (bitbucket or gitea)" env:"GIT_PROVIDER"`
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
|
||||
Bitbucket BitbucketConfig `embed:"" prefix:"bitbucket-"`
|
||||
Gitea GiteaConfig `embed:"" prefix:"gitea-"`
|
||||
@@ -117,7 +129,7 @@ func main() {
|
||||
log.Fatalf("Error initializing AI: %v", err)
|
||||
}
|
||||
|
||||
pierreService := pierre.New(ai, git)
|
||||
pierreService := pierre.New(ai, git, cfg.Review.MaxChunkSize, cfg.Review.Guidelines, cfg.Review.DisableComments)
|
||||
if err := pierreService.MakeReview(context.Background(), cfg.Repo.Owner, cfg.Repo.Repo, cfg.Repo.PRID); err != nil {
|
||||
log.Fatalf("Error during review: %v", err)
|
||||
}
|
||||
|
||||
3
go.mod
@@ -7,7 +7,9 @@ require (
|
||||
github.com/alecthomas/kong v1.14.0
|
||||
github.com/alecthomas/kong-yaml v0.2.0
|
||||
github.com/google/generative-ai-go v0.20.1
|
||||
|
schreifuchs marked this conversation as resolved
schreifuchs
commented
You added `github.com/google/go-cmp v0.7.0` as a direct dependency – good for the new tests. Remember to run `go mod tidy` so that the `go.sum` is updated accordingly. (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
github.com/google/go-cmp v0.7.0
|
||||
github.com/ollama/ollama v0.16.0
|
||||
github.com/sashabaranov/go-openai v1.41.2
|
||||
google.golang.org/api v0.186.0
|
||||
)
|
||||
|
||||
@@ -34,7 +36,6 @@ require (
|
||||
github.com/googleapis/gax-go/v2 v2.12.5 // indirect
|
||||
github.com/hashicorp/go-version v1.7.0 // indirect
|
||||
github.com/mailru/easyjson v0.7.7 // indirect
|
||||
github.com/sashabaranov/go-openai v1.41.2 // indirect
|
||||
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
|
||||
go.opencensus.io v0.24.0 // indirect
|
||||
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.51.0 // indirect
|
||||
|
||||
@@ -31,7 +31,7 @@ func (b *BitbucketAdapter) GetDiff(ctx context.Context, projectKey, repositorySl
|
||||
if response.StatusCode != http.StatusOK {
|
||||
sb := &strings.Builder{}
|
||||
io.Copy(sb, response.Body)
|
||||
err = fmt.Errorf("error while fetching bitbucket diff staus %d, body %s", response.Status, sb.String())
|
||||
err = fmt.Errorf("error while fetching bitbucket diff status %d, body %s", response.StatusCode, sb.String())
|
||||
}
|
||||
|
||||
diff = response.Body
|
||||
|
||||
@@ -4,10 +4,14 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
|
||||
)
|
||||
|
||||
// DefaultChunkSize is the fallback maximum size (in bytes) for a diff chunk
// when no explicit value is configured; judgePR uses it whenever the
// configured maxChunkSize is non-positive (60 KB, roughly 15k tokens).
const DefaultChunkSize = 60000
|
||||
|
||||
type Comment struct {
|
||||
File string `json:"file"`
|
||||
Line int `json:"line"`
|
||||
@@ -19,21 +23,121 @@ func (s *Service) judgePR(ctx context.Context, diff io.Reader) (comments []Comme
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read diff: %w", err)
|
||||
}
|
||||
err = s.chat.GenerateStructured(ctx, []chatter.Message{
|
||||
{
|
||||
Role: chatter.RoleSystem,
|
||||
Content: `
|
||||
You are a very strict senior software architect.
|
||||
You review **only** newly added or modified lines in a unified diff, together with the immediate hunk context.
|
||||
You do **not** report issues that appear **solely** in deleted lines (“-”) or that have already been fixed by the change.
|
||||
No comments are made on pure formatting/whitespace changes or reordering that does not alter the program’s behavior.
|
||||
`,
|
||||
},
|
||||
{
|
||||
Role: chatter.RoleUser,
|
||||
Content: fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.\n Here is the git diff of it: %s", string(diffBytes)),
|
||||
},
|
||||
}, &comments)
|
||||
|
||||
// Determine chunk size (use default if not set)
|
||||
maxSize := s.maxChunkSize
|
||||
if maxSize <= 0 {
|
||||
maxSize = DefaultChunkSize // default 60KB ~ 15k tokens
|
||||
}
|
||||
|
||||
chunks := splitDiffIntoChunks(diffBytes, maxSize)
|
||||
allComments := []Comment{}
|
||||
|
||||
// Build optional guidelines text (added as a separate section with a clear delimiter)
|
||||
guidelinesText := ""
|
||||
if len(s.guidelines) > 0 {
|
||||
// Two newlines ensure the guidelines start on a fresh paragraph.
|
||||
guidelinesText = "\n\nProject guidelines:\n"
|
||||
for _, g := range s.guidelines {
|
||||
guidelinesText += "- " + g + "\n"
|
||||
}
|
||||
}
|
||||
|
||||
// System prompt that instructs the LLM precisely.
|
||||
baseSystem := strings.TrimSpace(`
|
||||
You are a strict senior software architect.
|
||||
Only comment on newly added or modified lines in the diff; ignore deletions, pure formatting, or re‑ordering that does not change behavior.
|
||||
For each issue output a JSON object with fields "file", "line", and "message" (message should be concise, ≤2 sentences, and actionable).
|
||||
If project guidelines are provided, treat them as hard rules that must be respected.`) + guidelinesText
|
||||
|
||||
for i, chunk := range chunks {
|
||||
// Include the chunk identifier in the system message only if there are multiple chunks.
|
||||
systemContent := baseSystem
|
||||
if len(chunks) > 1 {
|
||||
systemContent = fmt.Sprintf("%s\nChunk %d of %d.", baseSystem, i+1, len(chunks))
|
||||
}
|
||||
userContent := chunk
|
||||
|
||||
var chunkComments []Comment
|
||||
err = s.chat.GenerateStructured(ctx, []chatter.Message{{
|
||||
Role: chatter.RoleSystem,
|
||||
Content: systemContent,
|
||||
}, {
|
||||
Role: chatter.RoleUser,
|
||||
Content: userContent,
|
||||
}}, &chunkComments)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allComments = append(allComments, chunkComments...)
|
||||
}
|
||||
|
||||
// De‑duplicate comments (keyed by file:line)
|
||||
unique := make(map[string]Comment)
|
||||
for _, c := range allComments {
|
||||
key := fmt.Sprintf("%s:%d", c.File, c.Line)
|
||||
unique[key] = c
|
||||
}
|
||||
for _, v := range unique {
|
||||
comments = append(comments, v)
|
||||
}
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
The comment that builds `baseSystem` concatenates the dynamically created `guidelinesText` directly inside a raw string literal. This makes the final prompt contain a leading tab on every line, which the LLM will treat as part of the instruction. Consider using a plain string without the leading indentation or run `strings.TrimSpace` on the final prompt. (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
return
|
||||
}
|
||||
|
||||
// splitDiffIntoChunks splits a diff into chunks that do not exceed maxSize bytes.
|
||||
// It tries to split on file boundaries ("diff --git") first, then on hunk boundaries (@@),
|
||||
// and finally on a hard byte limit.
|
||||
func splitDiffIntoChunks(diff []byte, maxSize int) []string {
|
||||
if len(diff) <= maxSize {
|
||||
return []string{string(diff)}
|
||||
}
|
||||
content := string(diff)
|
||||
// Split by file headers
|
||||
parts := strings.Split(content, "\ndiff --git ")
|
||||
chunks := []string{}
|
||||
var current strings.Builder
|
||||
for idx, part := range parts {
|
||||
seg := part
|
||||
if idx != 0 {
|
||||
// Preserve the leading newline that was removed by Split
|
||||
seg = "\n" + "diff --git " + part
|
||||
}
|
||||
if current.Len()+len(seg) > maxSize && current.Len() > 0 {
|
||||
chunks = append(chunks, current.String())
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
The The `splitDiffIntoChunks` implementation drops the newline that separates two files when it splits on `"\ndiff --git "`. After the split the code re‑adds the prefix with `"diff --git " + part`, which loses the leading line‑break. When the chunks are concatenated the original diff is no longer identical, causing the `TestSplitDiffIntoChunks_LargeSingleFile` test to fail. Fix by adding the missing newline, e.g. `seg = "\n" + "diff --git " + part` (or keep the delimiter when splitting with `strings.SplitAfter`). (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
current.Reset()
|
||||
}
|
||||
if len(seg) > maxSize {
|
||||
// Split further by hunks
|
||||
hunks := strings.Split(seg, "\n@@ ")
|
||||
for j, h := range hunks {
|
||||
var hseg string
|
||||
if j == 0 {
|
||||
// First hunk segment already contains the preceding content (including any needed newline)
|
||||
hseg = h
|
||||
} else {
|
||||
// Subsequent hunks need the leading newline and "@@ " marker restored
|
||||
hseg = "\n@@ " + h
|
||||
}
|
||||
if current.Len()+len(hseg) > maxSize && current.Len() > 0 {
|
||||
chunks = append(chunks, current.String())
|
||||
current.Reset()
|
||||
}
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
`splitDiffIntoChunks` may split a diff in the middle of a line (e.g. when a single line exceeds `maxSize`). Breaking a diff line arbitrarily can corrupt the unified‑diff syntax and confuse the model. Prefer to split only on line boundaries – e.g. split the input with `strings.SplitAfter(diff, "\n")` and then build chunks while staying under the size limit. (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
if len(hseg) > maxSize {
|
||||
for len(hseg) > maxSize {
|
||||
chunks = append(chunks, hseg[:maxSize])
|
||||
hseg = hseg[maxSize:]
|
||||
}
|
||||
current.WriteString(hseg)
|
||||
} else {
|
||||
current.WriteString(hseg)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
current.WriteString(seg)
|
||||
}
|
||||
}
|
||||
if current.Len() > 0 {
|
||||
chunks = append(chunks, current.String())
|
||||
}
|
||||
return chunks
|
||||
}
|
||||
|
||||
132
internal/pierre/judge_test.go
Normal file
@@ -0,0 +1,132 @@
|
||||
package pierre
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"io"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
|
||||
"github.com/google/go-cmp/cmp"
|
||||
)
|
||||
|
||||
// mockChat implements the ChatAdapter interface for testing.
|
||||
type mockChat struct{ callCount int }
|
||||
|
||||
func (m *mockChat) GenerateStructured(ctx context.Context, msgs []chatter.Message, target interface{}) error {
|
||||
m.callCount++
|
||||
if cSlice, ok := target.(*[]Comment); ok {
|
||||
*cSlice = []Comment{{File: "file.go", Line: 1, Message: "test comment"}}
|
||||
return nil
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *mockChat) GetProviderName() string { return "mock" }
|
||||
|
||||
// mockGit implements the GitAdapter interface for testing.
|
||||
type mockGit struct{}
|
||||
|
||||
func (g *mockGit) GetDiff(ctx context.Context, owner, repo string, prID int) (io.ReadCloser, error) {
|
||||
diff := "diff --git a/file1.go b/file1.go\n+line1\n" + "diff --git a/file2.go b/file2.go\n+line2\n"
|
||||
return io.NopCloser(bytes.NewReader([]byte(diff))), nil
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
The test The test `TestSplitDiffIntoChunks_LargeSingleFile` relies on an exact string equality after re‑joining the chunks. Because of the newline‑loss bug described above the test will currently fail. Once the split function preserves delimiters, the test will pass; otherwise consider comparing the recombined diff with the original using `cmp.Diff` only (as you already do) rather than a strict equality check. (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
}
|
||||
|
||||
func (g *mockGit) AddComment(ctx context.Context, owner, repo string, prID int, comment Comment) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func TestSplitDiffIntoChunks(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
diff string
|
||||
maxSize int
|
||||
wantChunks int // 0 means we don't assert exact count
|
||||
wantPrefixes []string
|
||||
checkRecombine bool
|
||||
}{
|
||||
{
|
||||
name: "small diff",
|
||||
diff: "diff --git a/file1.txt b/file1.txt\n+added line\n",
|
||||
maxSize: 1000,
|
||||
wantChunks: 1,
|
||||
wantPrefixes: []string{"diff --git a/file1.txt"},
|
||||
checkRecombine: true,
|
||||
},
|
||||
{
|
||||
name: "multiple files",
|
||||
diff: "diff --git a/file1.txt b/file1.txt\n+added line 1\n" +
|
||||
"diff --git a/file2.txt b/file2.txt\n+added line 2\n",
|
||||
maxSize: 50,
|
||||
wantChunks: 2,
|
||||
wantPrefixes: []string{"diff --git a/file1.txt", "diff --git a/file2.txt"},
|
||||
checkRecombine: false,
|
||||
},
|
||||
{
|
||||
name: "large single file",
|
||||
diff: func() string {
|
||||
line := "+very long added line that will be repeated many times to exceed the chunk size\n"
|
||||
return "diff --git a/large.txt b/large.txt\n" + strings.Repeat(line, 200)
|
||||
}(),
|
||||
maxSize: 500,
|
||||
wantChunks: 0,
|
||||
wantPrefixes: nil,
|
||||
checkRecombine: true,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range cases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
chunks := splitDiffIntoChunks([]byte(tc.diff), tc.maxSize)
|
||||
if tc.wantChunks > 0 && len(chunks) != tc.wantChunks {
|
||||
t.Fatalf("expected %d chunks, got %d", tc.wantChunks, len(chunks))
|
||||
}
|
||||
for i, prefix := range tc.wantPrefixes {
|
||||
if i >= len(chunks) {
|
||||
t.Fatalf("missing chunk %d for prefix check", i)
|
||||
}
|
||||
trimmed := strings.TrimPrefix(chunks[i], "\n")
|
||||
if !strings.HasPrefix(trimmed, prefix) {
|
||||
t.Fatalf("chunk %d does not start with expected prefix %q: %s", i, prefix, chunks[i])
|
||||
}
|
||||
}
|
||||
for i, c := range chunks {
|
||||
if tc.maxSize > 0 && len(c) > tc.maxSize {
|
||||
t.Fatalf("chunk %d exceeds max size %d: %d", i, tc.maxSize, len(c))
|
||||
}
|
||||
}
|
||||
if tc.checkRecombine {
|
||||
recombined := strings.Join(chunks, "")
|
||||
if diff := cmp.Diff(tc.diff, recombined); diff != "" {
|
||||
t.Fatalf("recombined diff differs:\n%s", diff)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestJudgePR_ChunkAggregationAndDeduplication(t *testing.T) {
|
||||
chatMock := &mockChat{}
|
||||
svc := &Service{
|
||||
maxChunkSize: 50,
|
||||
guidelines: nil,
|
||||
git: &mockGit{},
|
||||
chat: chatMock,
|
||||
}
|
||||
diffReader, err := svc.git.GetDiff(context.Background(), "", "", 0)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get diff: %v", err)
|
||||
}
|
||||
defer diffReader.Close()
|
||||
comments, err := svc.judgePR(context.Background(), diffReader)
|
||||
if err != nil {
|
||||
t.Fatalf("judgePR error: %v", err)
|
||||
}
|
||||
if got, want := len(comments), 1; got != want {
|
||||
t.Fatalf("expected %d comment after deduplication, got %d", want, got)
|
||||
}
|
||||
if chatMock.callCount != 2 {
|
||||
t.Fatalf("expected mockChat to be called for each chunk (2), got %d", chatMock.callCount)
|
||||
}
|
||||
}
|
||||
@@ -7,15 +7,26 @@ import (
|
||||
"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
|
||||
)
|
||||
|
||||
// Service holds the core collaborators and configuration for Pierre.
|
||||
// The order of the fields is intentional: configuration fields first (used
|
||||
// during initialization) followed by the adapters. This prevents accidental
|
||||
// changes to the serialized layout if encoding/gob or encoding/json is used
|
||||
// elsewhere in the future.
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
`New` now takes the extra parameters `maxChunkSize` and `guidelines`. Ensure all call‑sites (currently only `cmd/pierre/main.go`) are updated accordingly. Consider adding a comment that documents the meaning of `maxChunkSize` (bytes) and that a non‑positive value triggers the built‑in default. (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
type Service struct {
|
||||
git GitAdapter
|
||||
chat ChatAdapter
|
||||
maxChunkSize int
|
||||
guidelines []string
|
||||
disableComments bool
|
||||
git GitAdapter
|
||||
chat ChatAdapter
|
||||
}
|
||||
|
||||
func New(chat ChatAdapter, git GitAdapter) *Service {
|
||||
func New(chat ChatAdapter, git GitAdapter, maxChunkSize int, guidelines []string, disableComments bool) *Service {
|
||||
return &Service{
|
||||
git: git,
|
||||
chat: chat,
|
||||
git: git,
|
||||
chat: chat,
|
||||
maxChunkSize: maxChunkSize,
|
||||
guidelines: guidelines,
|
||||
disableComments: disableComments,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -26,11 +26,17 @@ func (s *Service) MakeReview(ctx context.Context, organisation string, repo stri
|
||||
|
||||
for _, c := range comments {
|
||||
c.Message = fmt.Sprintf("%s (Generated by: %s)", c.Message, model)
|
||||
fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n",
|
||||
c.File, c.Line, c.Message, "---")
|
||||
|
||||
|
schreifuchs marked this conversation as resolved
Outdated
schreifuchs
commented
The condition for posting comments is inverted; it adds comments when `disableComments` is true. Change `if s.disableComments {` to `if !s.disableComments {`. (Reason: The code adds VCS comments only when `s.disableComments` is true, which contradicts the flag's purpose; flipping the condition fixes the logic.) (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil {
|
||||
log.Printf("Failed to add comment: %v", err)
|
||||
if s.disableComments {
|
||||
// Dry‑run: just log what would have been posted.
|
||||
log.Printf("dry‑run: %s:%d => %s", c.File, c.Line, c.Message)
|
||||
} else {
|
||||
|
schreifuchs marked this conversation as resolved
schreifuchs
commented
Update the comment above the block to reflect that comments are posted only when not in dry-run mode. (Reason: The existing comment is misleading — the code posts comments to the VCS only when `disableComments` (dry-run) is false, so the comment should state that posting occurs only outside dry-run mode.) (Generated by: OpenAI (openai/gpt-oss-120b))
|
||||
// Normal mode: print to stdout and post the comment to the VCS.
|
||||
fmt.Printf("File: %s\nLine: %d\nMessage: %s\n%s\n",
|
||||
c.File, c.Line, c.Message, "---")
|
||||
if err := s.git.AddComment(ctx, organisation, repo, prID, c); err != nil {
|
||||
log.Printf("Failed to add comment: %v", err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
`cfg.Review.MaxChunkSize` is an `int` that may be zero if the user omits the flag. The service correctly falls back to the default (60000) inside `judgePR`, but you could also enforce the default earlier (e.g. in the config struct default tag or after flag parsing) to avoid passing a zero value downstream. (Generated by: OpenAI (openai/gpt-oss-120b))