[GITEA] Improved Linguist compatibility
Recognise the `linguist-documentation` and `linguist-detectable`
attributes in `.gitattributes` files, and use them in
`GetLanguageStats()` to make a decision whether to include a particular
file in the stats or not.
This gives users more control over which files in their repositories
contribute toward the language statistics, so that for a project that is
mostly documentation, the language stats can reflect that.
Fixes #1672.
Signed-off-by: Gergely Nagy <forgejo@gergo.csillger.hu>
(cherry picked from commit 6d4e02fe5f
)
This commit is contained in:
parent
119d10d9e2
commit
ee1ead8189
5 changed files with 341 additions and 28 deletions
|
@ -291,7 +291,7 @@ func (repo *Repository) CheckAttributeReader(commitID string) (*CheckAttributeRe
|
|||
}
|
||||
|
||||
checker := &CheckAttributeReader{
|
||||
Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language"},
|
||||
Attributes: []string{"linguist-vendored", "linguist-generated", "linguist-language", "gitlab-language", "linguist-documentation", "linguist-detectable"},
|
||||
Repo: repo,
|
||||
IndexFile: indexFilename,
|
||||
WorkTree: worktree,
|
||||
|
|
|
@ -13,6 +13,18 @@ const (
|
|||
bigFileSize int64 = 1024 * 1024 // 1 MiB
|
||||
)
|
||||
|
||||
// LinguistBoolAttrib wraps the raw string value of a boolean Linguist
// attribute (for example `linguist-vendored` or `linguist-detectable`) so
// that callers can tell apart three states: explicitly enabled, explicitly
// disabled, and anything else.
//
// The zero value has an empty Value, for which both IsTrue and IsFalse
// report false.
type LinguistBoolAttrib struct {
	// Value is the attribute value exactly as it was looked up; only
	// "set", "true", "unset" and "false" are given a meaning.
	Value string
}

// IsTrue reports whether the attribute is explicitly enabled, i.e. whether
// Value is "set" or "true". The comparison is exact and case-sensitive.
func (a *LinguistBoolAttrib) IsTrue() bool {
	return a.Value == "set" || a.Value == "true"
}

// IsFalse reports whether the attribute is explicitly disabled, i.e. whether
// Value is "unset" or "false". The comparison is exact and case-sensitive.
//
// IsFalse is not the negation of IsTrue: for any other Value (including the
// empty string) both methods return false.
func (a *LinguistBoolAttrib) IsFalse() bool {
	return a.Value == "unset" || a.Value == "false"
}
|
||||
|
||||
// mergeLanguageStats merges language names with different cases. The name with most upper case letters is used.
|
||||
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
|
||||
names := map[string]struct {
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
//go:build gogit
|
||||
|
@ -57,23 +58,25 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
return nil
|
||||
}
|
||||
|
||||
notVendored := false
|
||||
notGenerated := false
|
||||
isVendored := LinguistBoolAttrib{}
|
||||
isGenerated := LinguistBoolAttrib{}
|
||||
isDocumentation := LinguistBoolAttrib{}
|
||||
isDetectable := LinguistBoolAttrib{}
|
||||
|
||||
if checker != nil {
|
||||
attrs, err := checker.CheckPath(f.Name)
|
||||
if err == nil {
|
||||
if vendored, has := attrs["linguist-vendored"]; has {
|
||||
if vendored == "set" || vendored == "true" {
|
||||
return nil
|
||||
}
|
||||
notVendored = vendored == "false"
|
||||
isVendored = LinguistBoolAttrib{Value: vendored}
|
||||
}
|
||||
if generated, has := attrs["linguist-generated"]; has {
|
||||
if generated == "set" || generated == "true" {
|
||||
return nil
|
||||
isGenerated = LinguistBoolAttrib{Value: generated}
|
||||
}
|
||||
notGenerated = generated == "false"
|
||||
if documentation, has := attrs["linguist-documentation"]; has {
|
||||
isDocumentation = LinguistBoolAttrib{Value: documentation}
|
||||
}
|
||||
if detectable, has := attrs["linguist-detectable"]; has {
|
||||
isDetectable = LinguistBoolAttrib{Value: detectable}
|
||||
}
|
||||
if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
|
||||
// group languages, such as Pug -> HTML; SCSS -> CSS
|
||||
|
@ -105,8 +108,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
}
|
||||
}
|
||||
|
||||
if (!notVendored && analyze.IsVendor(f.Name)) || enry.IsDotFile(f.Name) ||
|
||||
enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) {
|
||||
if isDetectable.IsFalse() || isVendored.IsTrue() || isDocumentation.IsTrue() ||
|
||||
(!isVendored.IsFalse() && analyze.IsVendor(f.Name)) ||
|
||||
enry.IsDotFile(f.Name) ||
|
||||
enry.IsConfiguration(f.Name) ||
|
||||
(!isDocumentation.IsFalse() && enry.IsDocumentation(f.Name)) {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -115,12 +121,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
if f.Size <= bigFileSize {
|
||||
content, _ = readFile(f, fileSizeLimit)
|
||||
}
|
||||
if !notGenerated && enry.IsGenerated(f.Name, content) {
|
||||
if !isGenerated.IsTrue() && enry.IsGenerated(f.Name, content) {
|
||||
return nil
|
||||
}
|
||||
|
||||
// TODO: Use .gitattributes file for linguist overrides
|
||||
|
||||
language := analyze.GetCodeLanguage(f.Name, content)
|
||||
if language == enry.OtherLanguage || language == "" {
|
||||
return nil
|
||||
|
@ -136,6 +141,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
if !checked {
|
||||
langtype := enry.GetLanguageType(language)
|
||||
included = langtype == enry.Programming || langtype == enry.Markup
|
||||
if !included {
|
||||
if isDetectable.IsTrue() {
|
||||
included = true
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
includedLanguage[language] = included
|
||||
}
|
||||
if included {
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
// Copyright 2020 The Gitea Authors. All rights reserved.
|
||||
// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
//go:build !gogit
|
||||
|
@ -90,23 +91,25 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
continue
|
||||
}
|
||||
|
||||
notVendored := false
|
||||
notGenerated := false
|
||||
isVendored := LinguistBoolAttrib{}
|
||||
isGenerated := LinguistBoolAttrib{}
|
||||
isDocumentation := LinguistBoolAttrib{}
|
||||
isDetectable := LinguistBoolAttrib{}
|
||||
|
||||
if checker != nil {
|
||||
attrs, err := checker.CheckPath(f.Name())
|
||||
if err == nil {
|
||||
if vendored, has := attrs["linguist-vendored"]; has {
|
||||
if vendored == "set" || vendored == "true" {
|
||||
continue
|
||||
}
|
||||
notVendored = vendored == "false"
|
||||
isVendored = LinguistBoolAttrib{Value: vendored}
|
||||
}
|
||||
if generated, has := attrs["linguist-generated"]; has {
|
||||
if generated == "set" || generated == "true" {
|
||||
continue
|
||||
isGenerated = LinguistBoolAttrib{Value: generated}
|
||||
}
|
||||
notGenerated = generated == "false"
|
||||
if documentation, has := attrs["linguist-documentation"]; has {
|
||||
isDocumentation = LinguistBoolAttrib{Value: documentation}
|
||||
}
|
||||
if detectable, has := attrs["linguist-detectable"]; has {
|
||||
isDetectable = LinguistBoolAttrib{Value: detectable}
|
||||
}
|
||||
if language, has := attrs["linguist-language"]; has && language != "unspecified" && language != "" {
|
||||
// group languages, such as Pug -> HTML; SCSS -> CSS
|
||||
|
@ -139,8 +142,11 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
}
|
||||
}
|
||||
|
||||
if (!notVendored && analyze.IsVendor(f.Name())) || enry.IsDotFile(f.Name()) ||
|
||||
enry.IsDocumentation(f.Name()) || enry.IsConfiguration(f.Name()) {
|
||||
if isDetectable.IsFalse() || isVendored.IsTrue() || isDocumentation.IsTrue() ||
|
||||
(!isVendored.IsFalse() && analyze.IsVendor(f.Name())) ||
|
||||
enry.IsDotFile(f.Name()) ||
|
||||
enry.IsConfiguration(f.Name()) ||
|
||||
(!isDocumentation.IsFalse() && enry.IsDocumentation(f.Name())) {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -173,7 +179,7 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
return nil, err
|
||||
}
|
||||
}
|
||||
if !notGenerated && enry.IsGenerated(f.Name(), content) {
|
||||
if !isGenerated.IsTrue() && enry.IsGenerated(f.Name(), content) {
|
||||
continue
|
||||
}
|
||||
|
||||
|
@ -194,6 +200,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, err
|
|||
if !checked {
|
||||
langType := enry.GetLanguageType(language)
|
||||
included = langType == enry.Programming || langType == enry.Markup
|
||||
if !included {
|
||||
if isDetectable.IsTrue() {
|
||||
included = true
|
||||
} else {
|
||||
continue
|
||||
}
|
||||
}
|
||||
includedLanguage[language] = included
|
||||
}
|
||||
if included {
|
||||
|
|
276
tests/integration/repo_lang_stats_test.go
Normal file
276
tests/integration/repo_lang_stats_test.go
Normal file
|
@ -0,0 +1,276 @@
|
|||
// Copyright 2024 The Forgejo Authors c/o Codeberg e.V.. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package integration
|
||||
|
||||
import (
|
||||
"context"
|
||||
"net/url"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"code.gitea.io/gitea/models/db"
|
||||
repo_model "code.gitea.io/gitea/models/repo"
|
||||
"code.gitea.io/gitea/models/unittest"
|
||||
user_model "code.gitea.io/gitea/models/user"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/indexer/stats"
|
||||
"code.gitea.io/gitea/modules/queue"
|
||||
repo_service "code.gitea.io/gitea/services/repository"
|
||||
files_service "code.gitea.io/gitea/services/repository/files"
|
||||
"code.gitea.io/gitea/tests"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func createLangStatTestRepo(t *testing.T) (*repo_model.Repository, func()) {
|
||||
t.Helper()
|
||||
|
||||
user2 := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: 2})
|
||||
|
||||
// Create a new repository
|
||||
repo, err := repo_service.CreateRepository(db.DefaultContext, user2, user2, repo_service.CreateRepoOptions{
|
||||
Name: "lang-stat-test",
|
||||
Description: "minimal repo for language stats testing",
|
||||
AutoInit: true,
|
||||
Gitignores: "Go",
|
||||
License: "MIT",
|
||||
Readme: "Default",
|
||||
DefaultBranch: "main",
|
||||
IsPrivate: false,
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
assert.NotEmpty(t, repo)
|
||||
|
||||
return repo, func() {
|
||||
repo_service.DeleteRepository(db.DefaultContext, user2, repo, false)
|
||||
}
|
||||
}
|
||||
|
||||
func addLangStatTestFiles(t *testing.T, repo *repo_model.Repository, contents string) string {
|
||||
t.Helper()
|
||||
|
||||
owner := unittest.AssertExistsAndLoadBean(t, &user_model.User{ID: repo.OwnerID})
|
||||
|
||||
addFilesResp, err := files_service.ChangeRepoFiles(git.DefaultContext, repo, owner, &files_service.ChangeRepoFilesOptions{
|
||||
Files: []*files_service.ChangeRepoFile{
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: ".gitattributes",
|
||||
ContentReader: strings.NewReader(contents),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: "docs.md",
|
||||
ContentReader: strings.NewReader("This **is** a `markdown` file.\n"),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: "foo.c",
|
||||
ContentReader: strings.NewReader(`#include <stdio.h>\nint main() {\n printf("Hello world!\n");\n return 0;\n}\n`),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: "foo.nib",
|
||||
ContentReader: strings.NewReader("Pinky promise, this is not a generated file!\n"),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: ".dot.pas",
|
||||
ContentReader: strings.NewReader("program Hello;\nbegin\n writeln('Hello, world.');\nend.\n"),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: "cpplint.py",
|
||||
ContentReader: strings.NewReader(`#! /usr/bin/env python\n\nprint("Hello world!")\n`),
|
||||
},
|
||||
{
|
||||
Operation: "create",
|
||||
TreePath: "some-file.xml",
|
||||
ContentReader: strings.NewReader(`<?xml version="1.0"?>\n<foo>\n <bar>Hello</bar>\n</foo>\n`),
|
||||
},
|
||||
},
|
||||
Message: "add files",
|
||||
OldBranch: "main",
|
||||
NewBranch: "main",
|
||||
Author: &files_service.IdentityOptions{
|
||||
Name: owner.Name,
|
||||
Email: owner.Email,
|
||||
},
|
||||
Committer: &files_service.IdentityOptions{
|
||||
Name: owner.Name,
|
||||
Email: owner.Email,
|
||||
},
|
||||
Dates: &files_service.CommitDateOptions{
|
||||
Author: time.Now(),
|
||||
Committer: time.Now(),
|
||||
},
|
||||
})
|
||||
assert.NoError(t, err)
|
||||
assert.NotEmpty(t, addFilesResp)
|
||||
|
||||
return addFilesResp.Commit.SHA
|
||||
}
|
||||
|
||||
func TestRepoLangStats(t *testing.T) {
|
||||
onGiteaRun(t, func(t *testing.T, u *url.URL) {
|
||||
/******************
|
||||
** Preparations **
|
||||
******************/
|
||||
prep := func(t *testing.T, attribs string) (*repo_model.Repository, string, func()) {
|
||||
t.Helper()
|
||||
|
||||
repo, f := createLangStatTestRepo(t)
|
||||
sha := addLangStatTestFiles(t, repo, attribs)
|
||||
|
||||
return repo, sha, f
|
||||
}
|
||||
|
||||
getFreshLanguageStats := func(t *testing.T, repo *repo_model.Repository, sha string) repo_model.LanguageStatList {
|
||||
t.Helper()
|
||||
|
||||
err := stats.UpdateRepoIndexer(repo)
|
||||
assert.NoError(t, err)
|
||||
|
||||
assert.NoError(t, queue.GetManager().FlushAll(context.Background(), 10*time.Second))
|
||||
|
||||
status, err := repo_model.GetIndexerStatus(db.DefaultContext, repo, repo_model.RepoIndexerTypeStats)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, sha, status.CommitSha)
|
||||
langs, err := repo_model.GetTopLanguageStats(db.DefaultContext, repo, 5)
|
||||
assert.NoError(t, err)
|
||||
|
||||
return langs
|
||||
}
|
||||
|
||||
/***********
|
||||
** Tests **
|
||||
***********/
|
||||
|
||||
// 1. By default, documentation is not indexed
|
||||
t.Run("default", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
|
||||
// While this is a fairly short test, this exercises a number of
|
||||
// things:
|
||||
//
|
||||
// - `.gitattributes` is empty, so `isDetectable.IsFalse()`,
|
||||
// `isVendored.IsTrue()`, and `isDocumentation.IsTrue()` will be
|
||||
// false for every file, because these are only true if an
|
||||
// attribute is explicitly set.
|
||||
//
|
||||
// - There is `.dot.pas`, which would be considered Pascal source,
|
||||
// but it is a dotfile (thus, `enry.IsDotFile()` applies), and as
|
||||
// such, is not considered.
|
||||
//
|
||||
// - `some-file.xml` will be skipped because Enry considers XML
|
||||
// configuration, and `enry.IsConfiguration()` will catch it.
|
||||
//
|
||||
// - `!isVendored.IsFalse()` evaluates to true, so
|
||||
// `analyze.isVendor()` will be called on `cpplint.py`, which will
|
||||
// be considered vendored, even though both the filename and
|
||||
// contents would otherwise make it Python.
|
||||
//
|
||||
// - `!isDocumentation.IsFalse()` evaluates to true, so
|
||||
// `enry.IsDocumentation()` will be called for `docs.md`, and will
|
||||
// be considered documentation, thus, skipped.
|
||||
//
|
||||
// Thus, this exercises all of the conditions in the first big if
|
||||
// that is supposed to filter out files early. With two short asserts!
|
||||
|
||||
assert.Len(t, langs, 1)
|
||||
assert.Equal(t, "C", langs[0].Language)
|
||||
})
|
||||
|
||||
// 2. Marking foo.c as non-detectable
|
||||
t.Run("foo.c non-detectable", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "foo.c linguist-detectable=false\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Empty(t, langs)
|
||||
})
|
||||
|
||||
// 3. Marking Markdown detectable
|
||||
t.Run("detectable markdown", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "*.md linguist-detectable\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Len(t, langs, 2)
|
||||
assert.Equal(t, "C", langs[0].Language)
|
||||
assert.Equal(t, "Markdown", langs[1].Language)
|
||||
})
|
||||
|
||||
// 4. Marking foo.c as documentation
|
||||
t.Run("foo.c as documentation", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "foo.c linguist-documentation\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Empty(t, langs)
|
||||
})
|
||||
|
||||
// 5. Overriding a generated file
|
||||
t.Run("linguist-generated=false", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "foo.nib linguist-generated=false\nfoo.nib linguist-language=Perl\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Len(t, langs, 2)
|
||||
assert.Equal(t, "C", langs[0].Language)
|
||||
assert.Equal(t, "Perl", langs[1].Language)
|
||||
})
|
||||
|
||||
// 6. Disabling vendoring for a file
|
||||
t.Run("linguist-vendored=false", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "cpplint.py linguist-vendored=false\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Len(t, langs, 2)
|
||||
assert.Equal(t, "C", langs[0].Language)
|
||||
assert.Equal(t, "Python", langs[1].Language)
|
||||
})
|
||||
|
||||
// 7. Disabling vendoring for a file, with -linguist-vendored
|
||||
t.Run("-linguist-vendored", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "cpplint.py -linguist-vendored\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Len(t, langs, 2)
|
||||
assert.Equal(t, "C", langs[0].Language)
|
||||
assert.Equal(t, "Python", langs[1].Language)
|
||||
})
|
||||
|
||||
// 8. Marking foo.c as vendored
|
||||
t.Run("foo.c as vendored", func(t *testing.T) {
|
||||
defer tests.PrintCurrentTest(t)()
|
||||
|
||||
repo, sha, f := prep(t, "foo.c linguist-vendored\n")
|
||||
defer f()
|
||||
|
||||
langs := getFreshLanguageStats(t, repo, sha)
|
||||
assert.Empty(t, langs)
|
||||
})
|
||||
})
|
||||
}
|
Loading…
Reference in a new issue