init: pristine aerc 0.20.0 source
This commit is contained in:
@@ -0,0 +1,129 @@
|
||||
package parse
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"io"
|
||||
"regexp"
|
||||
"sort"
|
||||
)
|
||||
|
||||
// Partial regexp to match the beginning of URLs and email addresses.
|
||||
// The remainder of the matched URLs/emails is parsed manually.
|
||||
var urlRe = regexp.MustCompile(
|
||||
`([a-z]{2,8})://` + // URL start
|
||||
`|` + // or
|
||||
`(mailto:)?[[:alnum:]_+.~/-]*[[:alnum:]]@`, // email start
|
||||
)
|
||||
|
||||
// HttpLinks searches a reader for a http link and returns a copy of the
|
||||
// reader and a slice with links.
|
||||
func HttpLinks(r io.Reader) (io.Reader, []string) {
|
||||
buf, err := io.ReadAll(r)
|
||||
if err != nil {
|
||||
return r, nil
|
||||
}
|
||||
|
||||
links := make(map[string]bool)
|
||||
b := buf
|
||||
match := urlRe.FindSubmatchIndex(b)
|
||||
for ; match != nil; match = urlRe.FindSubmatchIndex(b) {
|
||||
// Regular expressions do not really cut it here and we
|
||||
// need to detect opening/closing braces to handle
|
||||
// markdown link syntax.
|
||||
var paren, bracket, ltgt, scheme int
|
||||
var emitUrl bool
|
||||
i, j := match[0], match[1]
|
||||
b = b[i:]
|
||||
scheme = j - i
|
||||
j = scheme
|
||||
|
||||
// "inline" email without a mailto: prefix - add some extra checks for those
|
||||
inlineEmail := len(match) > 4 && match[2] == -1 && match[4] == -1
|
||||
|
||||
for !emitUrl && j < len(b) && bytes.IndexByte(urichars, b[j]) != -1 {
|
||||
switch b[j] {
|
||||
case '[':
|
||||
bracket++
|
||||
j++
|
||||
case '(':
|
||||
paren++
|
||||
j++
|
||||
case '<':
|
||||
ltgt++
|
||||
j++
|
||||
case ']':
|
||||
bracket--
|
||||
if bracket < 0 {
|
||||
emitUrl = true
|
||||
} else {
|
||||
j++
|
||||
}
|
||||
case ')':
|
||||
paren--
|
||||
if paren < 0 {
|
||||
emitUrl = true
|
||||
} else {
|
||||
j++
|
||||
}
|
||||
case '>':
|
||||
ltgt--
|
||||
if ltgt < 0 {
|
||||
emitUrl = true
|
||||
} else {
|
||||
j++
|
||||
}
|
||||
case '&':
|
||||
if inlineEmail {
|
||||
emitUrl = true
|
||||
} else {
|
||||
j++
|
||||
}
|
||||
default:
|
||||
j++
|
||||
}
|
||||
|
||||
// we don't want those in inline emails
|
||||
if inlineEmail && (paren > 0 || ltgt > 0 || bracket > 0) {
|
||||
j--
|
||||
emitUrl = true
|
||||
}
|
||||
}
|
||||
|
||||
// Heuristic to remove trailing characters that are
|
||||
// valid URL characters, but typically not at the end of
|
||||
// the URL
|
||||
for trim := true; trim && j > 0; {
|
||||
switch b[j-1] {
|
||||
case '.', ',', ':', ';', '?', '!', '"', '\'', '%':
|
||||
j--
|
||||
default:
|
||||
trim = false
|
||||
}
|
||||
}
|
||||
if j == scheme {
|
||||
// Only an URL scheme, ignore.
|
||||
b = b[j:]
|
||||
continue
|
||||
}
|
||||
url := string(b[:j])
|
||||
if inlineEmail {
|
||||
// Email address with missing mailto: scheme. Add it.
|
||||
url = "mailto:" + url
|
||||
}
|
||||
links[url] = true
|
||||
b = b[j:]
|
||||
}
|
||||
|
||||
results := make([]string, 0, len(links))
|
||||
for link := range links {
|
||||
results = append(results, link)
|
||||
}
|
||||
sort.Strings(results)
|
||||
|
||||
return bytes.NewReader(buf), results
|
||||
}
|
||||
|
||||
var urichars = []byte(
|
||||
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" +
|
||||
"0123456789-_.,~:;/?#@!$&%*+=\"'<>()[]",
|
||||
)
|
||||
Reference in New Issue
Block a user