// pierre-bot/internal/pierre/judge.go

package pierre

import (
	"context"
	"fmt"
	"io"
	"strings"

	"git.schreifuchs.ch/schreifuchs/pierre-bot/internal/chatter"
)

// defaultChunkSize is the chunk size limit, in bytes, that judgePR falls back
// to when the service's maxChunkSize is not positive (60000 bytes, noted at the
// call site as roughly 15k tokens).
const defaultChunkSize = 60000

// Comment is a single review remark. judgePR passes a slice of Comment to the
// chat backend as the target for its structured output, so the JSON tags
// define the field names used for that (de)serialization.
type Comment struct {
	// File is the file the remark refers to.
	// NOTE(review): presumably a path as it appears in the diff headers — confirm.
	File string `json:"file"`
	// Line is the line number the remark refers to.
	// NOTE(review): presumably a line of the new file version — confirm.
	Line int `json:"line"`
	// Message is the text of the remark.
	Message string `json:"message"`
}

// judgePR reviews a unified diff with the chat model and returns the resulting
// comments.
//
// The diff is read completely, split with splitDiffIntoChunks (limit
// s.maxChunkSize, or defaultChunkSize when that is not positive) and every
// chunk is sent in its own GenerateStructured request together with a system
// prompt that optionally lists s.guidelines.
//
// Comments are de-duplicated by "file:line": the most recently received comment
// for a key wins, but it keeps the position of the first comment seen for that
// key, so the returned order follows the order of the model's answers instead
// of random map iteration order. The result is nil when no comments were made.
//
// An error reading the diff or talking to the chat backend aborts the review;
// in that case no comments are returned.
func (s *Service) judgePR(ctx context.Context, diff io.Reader) (comments []Comment, err error) {
	diffBytes, err := io.ReadAll(diff)
	if err != nil {
		return nil, fmt.Errorf("failed to read diff: %w", err)
	}

	// Fall back to the default limit (60KB ~ 15k tokens) when none is configured.
	maxSize := s.maxChunkSize
	if maxSize <= 0 {
		maxSize = defaultChunkSize
	}
	chunks := splitDiffIntoChunks(diffBytes, maxSize)

	var system strings.Builder
	system.WriteString(strings.TrimSpace(`
You are a very strict senior software architect.
You review **only** newly added or modified lines in a unified diff, together with the immediate hunk context.
You do **not** report issues that appear **solely** in deleted lines (“-") or that have already been fixed by the change.
No comments are made on pure formatting/whitespace changes or reordering that does not alter the program’s behavior.`))
	if len(s.guidelines) > 0 {
		// Separate the guidelines from the base prompt; without the blank line
		// the heading would be glued onto the prompt's last sentence.
		system.WriteString("\n\nProject guidelines:\n")
		for _, g := range s.guidelines {
			system.WriteString("- " + g + "\n")
		}
	}
	systemPrompt := system.String()

	// position maps a "file:line" key to the index of its comment in comments.
	position := make(map[string]int)
	for i, chunk := range chunks {
		// Add a small header so the model knows this is a fragment.
		header := fmt.Sprintf("\n--- Chunk %d of %d ---\n", i+1, len(chunks))
		userContent := fmt.Sprintf("Hello please review my PR. Write comments where improvements are necessary in new lines.%s\nHere is the git diff of it: %s", header, chunk)

		var chunkComments []Comment
		err = s.chat.GenerateStructured(ctx, []chatter.Message{{
			Role:    chatter.RoleSystem,
			Content: systemPrompt,
		}, {
			Role:    chatter.RoleUser,
			Content: userContent,
		}}, &chunkComments)
		if err != nil {
			return nil, fmt.Errorf("reviewing chunk %d of %d: %w", i+1, len(chunks), err)
		}

		for _, c := range chunkComments {
			key := fmt.Sprintf("%s:%d", c.File, c.Line)
			if idx, ok := position[key]; ok {
				// Later comment for the same location replaces the earlier one.
				comments[idx] = c
				continue
			}
			position[key] = len(comments)
			comments = append(comments, c)
		}
	}
	return comments, nil
}

// splitDiffIntoChunks splits a unified diff into consecutive chunks of at most
// maxSize bytes each. Concatenating the returned chunks yields the original
// diff unchanged.
//
// Splitting is attempted at progressively finer boundaries:
//  1. file boundaries ("\ndiff --git "),
//  2. hunk boundaries ("\n@@ ") for a file section that is itself too large,
//  3. a forced cut for a hunk that is still too large. The forced cut is made
//     after the last newline that fits, or, if there is none, at the last
//     UTF-8 rune boundary that fits, so that neither lines nor multi-byte
//     characters are torn apart unless unavoidable.
//
// A non-positive maxSize is treated as "no limit" and the whole diff is
// returned as a single chunk.
//
// NOTE(review): chunks that start at a hunk boundary or a forced cut do not
// repeat the "diff --git" file header of the file they belong to.
func splitDiffIntoChunks(diff []byte, maxSize int) []string {
	content := string(diff)
	// A limit <= 0 can never be satisfied (it previously looped forever or
	// panicked while slicing), so fall back to a single chunk.
	if maxSize <= 0 || len(content) <= maxSize {
		return []string{content}
	}

	const (
		fileSep = "\ndiff --git "
		hunkSep = "\n@@ "
	)

	var (
		chunks  []string
		current strings.Builder
	)

	// flush emits the pending chunk, if any.
	flush := func() {
		if current.Len() > 0 {
			chunks = append(chunks, current.String())
			current.Reset()
		}
	}

	// add appends seg (len(seg) <= maxSize) to the pending chunk, starting a
	// new chunk first when seg would not fit.
	add := func(seg string) {
		if current.Len()+len(seg) > maxSize {
			flush()
		}
		current.WriteString(seg)
	}

	// cutIndex returns where to cut s (len(s) > maxSize) so that the prefix is
	// non-empty and at most maxSize bytes long.
	cutIndex := func(s string) int {
		// Prefer ending the piece on a complete line. A newline at index 0 is
		// ignored because cutting there would emit a chunk holding only "\n".
		if nl := strings.LastIndexByte(s[:maxSize], '\n'); nl > 0 {
			return nl + 1
		}
		// Otherwise back off to the start of a rune. A UTF-8 encoded rune has
		// at most 3 continuation bytes (10xxxxxx), so 4 probes are enough.
		for back := 0; back < 4 && back < maxSize; back++ {
			if s[maxSize-back]&0xC0 != 0x80 {
				return maxSize - back
			}
		}
		// Not valid UTF-8 (or maxSize smaller than one rune): cut at the limit.
		return maxSize
	}

	for i, file := range strings.Split(content, fileSep) {
		if i > 0 {
			// Restore the separator consumed by Split.
			file = fileSep + file
		}
		if len(file) <= maxSize {
			add(file)
			continue
		}

		// The file section is too large on its own: start it on a fresh chunk
		// and split it further by hunks.
		flush()
		for j, hunk := range strings.Split(file, hunkSep) {
			if j > 0 {
				hunk = hunkSep + hunk
			}
			if len(hunk) > maxSize {
				// Emit pending text first so chunk order follows diff order.
				flush()
				for len(hunk) > maxSize {
					cut := cutIndex(hunk)
					chunks = append(chunks, hunk[:cut])
					hunk = hunk[cut:]
				}
			}
			add(hunk)
		}
	}
	flush()
	return chunks
}