Detect and remove a decoded BOM when showing content. Restore the previous encoding and BOM when updating content. On error, keep the content encoded as UTF-8. Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
parent
8b3aad940e
commit
21fb791747
3 changed files with 127 additions and 7 deletions
|
@ -5,6 +5,7 @@
|
||||||
package base
|
package base
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"crypto/md5"
|
"crypto/md5"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
"crypto/sha1"
|
"crypto/sha1"
|
||||||
|
@ -32,6 +33,9 @@ import (
|
||||||
"github.com/gogits/chardet"
|
"github.com/gogits/chardet"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// UTF8BOM holds the three bytes (EF BB BF) that form the UTF-8 byte-order
// mark.
var UTF8BOM = []byte{0xEF, 0xBB, 0xBF}
|
||||||
|
|
||||||
// EncodeMD5 encodes string to md5 hex value.
|
// EncodeMD5 encodes string to md5 hex value.
|
||||||
func EncodeMD5(str string) string {
|
func EncodeMD5(str string) string {
|
||||||
m := md5.New()
|
m := md5.New()
|
||||||
|
@ -87,6 +91,14 @@ func DetectEncoding(content []byte) (string, error) {
|
||||||
return result.Charset, err
|
return result.Charset, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// RemoveBOMIfPresent removes a UTF-8 BOM from a []byte
|
||||||
|
func RemoveBOMIfPresent(content []byte) []byte {
|
||||||
|
if len(content) > 2 && bytes.Equal(content[0:3], UTF8BOM) {
|
||||||
|
return content[3:]
|
||||||
|
}
|
||||||
|
return content
|
||||||
|
}
|
||||||
|
|
||||||
// BasicAuthDecode decode basic auth string
|
// BasicAuthDecode decode basic auth string
|
||||||
func BasicAuthDecode(encoded string) (string, string, error) {
|
func BasicAuthDecode(encoded string) (string, string, error) {
|
||||||
s, err := base64.StdEncoding.DecodeString(encoded)
|
s, err := base64.StdEncoding.DecodeString(encoded)
|
||||||
|
|
|
@ -267,7 +267,7 @@ func ToUTF8WithErr(content []byte) (string, error) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", err
|
return "", err
|
||||||
} else if charsetLabel == "UTF-8" {
|
} else if charsetLabel == "UTF-8" {
|
||||||
return string(content), nil
|
return string(base.RemoveBOMIfPresent(content)), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
encoding, _ := charset.Lookup(charsetLabel)
|
encoding, _ := charset.Lookup(charsetLabel)
|
||||||
|
@ -277,19 +277,21 @@ func ToUTF8WithErr(content []byte) (string, error) {
|
||||||
|
|
||||||
// If there is an error, we concatenate the nicely decoded part and the
|
// If there is an error, we concatenate the nicely decoded part and the
|
||||||
// original left over. This way we won't lose data.
|
// original left over. This way we won't lose data.
|
||||||
result, n, err := transform.String(encoding.NewDecoder(), string(content))
|
result, n, err := transform.Bytes(encoding.NewDecoder(), content)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
result = result + string(content[n:])
|
result = append(result, content[n:]...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return result, err
|
result = base.RemoveBOMIfPresent(result)
|
||||||
|
|
||||||
|
return string(result), err
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
|
// ToUTF8WithFallback detects the encoding of content and converts to UTF-8 if possible
|
||||||
func ToUTF8WithFallback(content []byte) []byte {
|
func ToUTF8WithFallback(content []byte) []byte {
|
||||||
charsetLabel, err := base.DetectEncoding(content)
|
charsetLabel, err := base.DetectEncoding(content)
|
||||||
if err != nil || charsetLabel == "UTF-8" {
|
if err != nil || charsetLabel == "UTF-8" {
|
||||||
return content
|
return base.RemoveBOMIfPresent(content)
|
||||||
}
|
}
|
||||||
|
|
||||||
encoding, _ := charset.Lookup(charsetLabel)
|
encoding, _ := charset.Lookup(charsetLabel)
|
||||||
|
@ -304,7 +306,7 @@ func ToUTF8WithFallback(content []byte) []byte {
|
||||||
return append(result, content[n:]...)
|
return append(result, content[n:]...)
|
||||||
}
|
}
|
||||||
|
|
||||||
return result
|
return base.RemoveBOMIfPresent(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ToUTF8 converts content to UTF8 encoding and ignore error
|
// ToUTF8 converts content to UTF8 encoding and ignore error
|
||||||
|
|
|
@ -5,15 +5,85 @@
|
||||||
package uploader
|
package uploader
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"fmt"
|
"fmt"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"golang.org/x/net/html/charset"
|
||||||
|
"golang.org/x/text/transform"
|
||||||
|
|
||||||
"code.gitea.io/git"
|
"code.gitea.io/git"
|
||||||
"code.gitea.io/gitea/models"
|
"code.gitea.io/gitea/models"
|
||||||
|
"code.gitea.io/gitea/modules/base"
|
||||||
"code.gitea.io/gitea/modules/lfs"
|
"code.gitea.io/gitea/modules/lfs"
|
||||||
|
"code.gitea.io/gitea/modules/log"
|
||||||
"code.gitea.io/gitea/modules/setting"
|
"code.gitea.io/gitea/modules/setting"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
func detectEncodingAndBOM(entry *git.TreeEntry, repo *models.Repository) (string, bool) {
|
||||||
|
reader, err := entry.Blob().DataAsync()
|
||||||
|
if err != nil {
|
||||||
|
// return default
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
defer reader.Close()
|
||||||
|
buf := make([]byte, 1024)
|
||||||
|
n, err := reader.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
// return default
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
buf = buf[:n]
|
||||||
|
|
||||||
|
if setting.LFS.StartServer {
|
||||||
|
meta := lfs.IsPointerFile(&buf)
|
||||||
|
if meta != nil {
|
||||||
|
meta, err = repo.GetLFSMetaObjectByOid(meta.Oid)
|
||||||
|
if err != nil && err != models.ErrLFSObjectNotExist {
|
||||||
|
// return default
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if meta != nil {
|
||||||
|
dataRc, err := lfs.ReadMetaObject(meta)
|
||||||
|
if err != nil {
|
||||||
|
// return default
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
defer dataRc.Close()
|
||||||
|
buf = make([]byte, 1024)
|
||||||
|
n, err = dataRc.Read(buf)
|
||||||
|
if err != nil {
|
||||||
|
// return default
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
buf = buf[:n]
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
encoding, err := base.DetectEncoding(buf)
|
||||||
|
if err != nil {
|
||||||
|
// just default to utf-8 and no bom
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
if encoding == "UTF-8" {
|
||||||
|
return encoding, bytes.Equal(buf[0:3], base.UTF8BOM)
|
||||||
|
}
|
||||||
|
charsetEncoding, _ := charset.Lookup(encoding)
|
||||||
|
if charsetEncoding == nil {
|
||||||
|
return "UTF-8", false
|
||||||
|
}
|
||||||
|
|
||||||
|
result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf))
|
||||||
|
|
||||||
|
if n > 2 {
|
||||||
|
return encoding, bytes.Equal([]byte(result)[0:3], base.UTF8BOM)
|
||||||
|
}
|
||||||
|
|
||||||
|
return encoding, false
|
||||||
|
}
|
||||||
|
|
||||||
// UpdateRepoFileOptions holds the repository file update options
|
// UpdateRepoFileOptions holds the repository file update options
|
||||||
type UpdateRepoFileOptions struct {
|
type UpdateRepoFileOptions struct {
|
||||||
LastCommitID string
|
LastCommitID string
|
||||||
|
@ -45,12 +115,29 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
|
||||||
return fmt.Errorf("UpdateRepoFile: %v", err)
|
return fmt.Errorf("UpdateRepoFile: %v", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
encoding := "UTF-8"
|
||||||
|
bom := false
|
||||||
|
|
||||||
if opts.IsNewFile {
|
if opts.IsNewFile {
|
||||||
for _, file := range filesInIndex {
|
for _, file := range filesInIndex {
|
||||||
if file == opts.NewTreeName {
|
if file == opts.NewTreeName {
|
||||||
return models.ErrRepoFileAlreadyExist{FileName: opts.NewTreeName}
|
return models.ErrRepoFileAlreadyExist{FileName: opts.NewTreeName}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
gitRepo, err := git.OpenRepository(t.basePath)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
tree, err := gitRepo.GetTree("HEAD")
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
entry, err := tree.GetTreeEntryByPath(opts.OldTreeName)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
encoding, bom = detectEncodingAndBOM(entry, repo)
|
||||||
}
|
}
|
||||||
|
|
||||||
//var stdout string
|
//var stdout string
|
||||||
|
@ -72,9 +159,28 @@ func UpdateRepoFile(repo *models.Repository, doer *models.User, opts *UpdateRepo
|
||||||
}
|
}
|
||||||
|
|
||||||
content := opts.Content
|
content := opts.Content
|
||||||
|
if bom {
|
||||||
|
content = string(base.UTF8BOM) + content
|
||||||
|
}
|
||||||
|
if encoding != "UTF-8" {
|
||||||
|
charsetEncoding, _ := charset.Lookup(encoding)
|
||||||
|
if charsetEncoding != nil {
|
||||||
|
result, _, err := transform.String(charsetEncoding.NewEncoder(), string(content))
|
||||||
|
if err != nil {
|
||||||
|
// If we can't re-encode the content back into the original encoding, we should just stick with UTF-8
|
||||||
|
log.Error(4, "Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", opts.NewTreeName, opts.OldTreeName, encoding, err)
|
||||||
|
result = content
|
||||||
|
}
|
||||||
|
content = result
|
||||||
|
} else {
|
||||||
|
log.Error(4, "Unknown encoding: %s", encoding)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Reset the opts.Content with the re-encoded and BOM'd content
|
||||||
|
opts.Content = content
|
||||||
var lfsMetaObject *models.LFSMetaObject
|
var lfsMetaObject *models.LFSMetaObject
|
||||||
|
|
||||||
if filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
|
if setting.LFS.StartServer && filename2attribute2info[opts.NewTreeName] != nil && filename2attribute2info[opts.NewTreeName]["filter"] == "lfs" {
|
||||||
// OK so we are supposed to LFS this data!
|
// OK so we are supposed to LFS this data!
|
||||||
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
|
oid, err := models.GenerateLFSOid(strings.NewReader(opts.Content))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
Loading…
Reference in a new issue