Signed-off-by: 吴文峰 <kevin@lmve.net>

This commit is contained in:
2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
+153
View File
@@ -0,0 +1,153 @@
// Package parser extracts title, description, text content, and links from HTML.
package parser
import (
"path"
"regexp"
"strings"
"golang.org/x/net/html"
)
// wsRe matches a run of one or more whitespace characters. ParseHTML uses it
// to collapse each such run inside a text node into a single space.
var wsRe = regexp.MustCompile(`\s+`)
// ParseHTML parses an HTML document and returns its title, meta description,
// text content, and outgoing links.
//
// body is the raw HTML; baseURL is the address the document was fetched from
// and is used to resolve the href of every <a> element.
//
// Results:
//   - title: the whitespace-normalized text of the last non-empty <title>
//     text node encountered.
//   - description: the content of the first <meta name="description"> whose
//     content is non-empty.
//   - text: every other non-empty text node, whitespace-normalized and joined
//     with single spaces. Subtrees rooted at <script>, <style> and <svg> are
//     skipped entirely.
//   - hrefs: the resolved link targets in document order, with fragments
//     removed. Links that are empty after fragment removal, or that
//     resolveURL rejects by returning "", are omitted.
//
// If the document cannot be parsed, all results are zero values.
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
	// Split the page address once; every link is resolved against these.
	base := baseFromURL(baseURL)
	basePath := pathFromURL(baseURL)

	doc, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return "", "", "", nil
	}

	var textParts []string
	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		switch n.Type {
		case html.ElementNode:
			switch strings.ToLower(n.Data) {
			case "script", "style", "svg":
				// Not content: skip the element together with its subtree.
				return
			case "meta":
				var name, content string
				for _, a := range n.Attr {
					switch strings.ToLower(a.Key) {
					case "name":
						name = strings.ToLower(a.Val)
					case "content":
						content = a.Val
					}
				}
				// First non-empty description wins.
				if name == "description" && description == "" {
					description = content
				}
			case "a":
				// Authors may pad the attribute value with whitespace; trim it
				// so that " https://x " is still recognized as an absolute URL.
				href := strings.TrimSpace(attrVal(n, "href"))
				// The fragment only addresses a position inside the target.
				if i := strings.Index(href, "#"); i >= 0 {
					href = href[:i]
				}
				if href != "" {
					if abs := resolveURL(base, basePath, href); abs != "" {
						hrefs = append(hrefs, abs)
					}
				}
			}
		case html.TextNode:
			if n.Parent == nil {
				break
			}
			s := strings.TrimSpace(wsRe.ReplaceAllString(n.Data, " "))
			if s == "" {
				break
			}
			inTitle := n.Parent.Type == html.ElementNode &&
				strings.ToLower(n.Parent.Data) == "title"
			if inTitle {
				title = s
			} else {
				textParts = append(textParts, s)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(doc)

	text = strings.Join(textParts, " ")
	return title, description, text, hrefs
}
// attrVal returns the value of the first attribute of n whose lower-cased key
// equals key, or "" when n has no such attribute. key itself is compared
// as-is, so callers should pass it in lower case.
func attrVal(n *html.Node, key string) string {
	for i := range n.Attr {
		if attr := n.Attr[i]; strings.ToLower(attr.Key) == key {
			return attr.Val
		}
	}
	return ""
}
// baseFromURL returns the "scheme://authority" prefix of rawURL, without a
// trailing slash, e.g. "https://example.com" for
// "https://example.com/a/b?x=1".
//
// It returns "" when rawURL contains no "://" separator.
func baseFromURL(rawURL string) string {
	schemeEnd := strings.Index(rawURL, "://")
	if schemeEnd < 0 {
		return ""
	}
	authorityStart := schemeEnd + len("://")

	// The authority is terminated by the first '/', '?' or '#'. Looking only
	// for '/' would let a query or fragment of a path-less URL (such as
	// "https://example.com?next=/home") leak into the base.
	end := strings.IndexAny(rawURL[authorityStart:], "/?#")
	if end < 0 {
		return rawURL
	}
	return rawURL[:authorityStart+end]
}
// pathFromURL returns the path component of rawURL with any query string and
// fragment removed, e.g. "/a/b.html" for "https://example.com/a/b.html?x=1#y".
//
// It returns "/" when rawURL contains no "://" separator or has no path.
func pathFromURL(rawURL string) string {
	schemeEnd := strings.Index(rawURL, "://")
	if schemeEnd < 0 {
		return "/"
	}
	rest := rawURL[schemeEnd+len("://"):]

	// The authority ends at the first '/', '?' or '#'. Only a '/' starts a
	// path; a '/' that appears later, inside the query or fragment of a
	// path-less URL, must not be mistaken for one.
	start := strings.IndexAny(rest, "/?#")
	if start < 0 || rest[start] != '/' {
		return "/"
	}

	p := rest[start:]
	// Strip query and fragment, whichever comes first.
	if end := strings.IndexAny(p, "?#"); end >= 0 {
		p = p[:end]
	}
	return p
}
// resolveURL turns href, as found in a document located at base+basePath,
// into an absolute URL.
//
// base is the "scheme://authority" prefix of the document URL (no trailing
// slash) and basePath is the document's path. The result depends on the form
// of href:
//
//   - "http://..." or "https://..." (scheme compared case-insensitively):
//     returned unchanged.
//   - any other scheme ("mailto:", "javascript:", "tel:", ...): "" is
//     returned, because such links are not resolvable against base.
//   - "//host/...": prefixed with the scheme of base; "" if base has none.
//   - "/path": appended to base.
//   - "?query": appended to base+basePath, i.e. the same document.
//   - anything else: resolved relative to the directory of basePath. "." and
//     ".." segments are collapsed, while a trailing slash and the query
//     string of href are preserved verbatim.
func resolveURL(base, basePath, href string) string {
	// Links that carry their own scheme.
	if scheme := hrefScheme(href); scheme != "" {
		isWeb := scheme == "http" || scheme == "https"
		if isWeb && strings.HasPrefix(href[len(scheme)+1:], "//") {
			return href
		}
		return ""
	}

	// Protocol-relative: borrow the scheme (including ':') from base.
	if strings.HasPrefix(href, "//") {
		idx := strings.Index(base, "://")
		if idx < 0 {
			return ""
		}
		return base[:idx+1] + href
	}

	// Absolute path on the same host.
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// Relative reference. An empty basePath would make path.Dir return ".",
	// which would glue the result onto the host without a separator.
	if basePath == "" {
		basePath = "/"
	}

	// Keep the query out of path.Clean: it is opaque data and may itself
	// contain "/" or "..".
	query := ""
	if q := strings.Index(href, "?"); q >= 0 {
		href, query = href[:q], href[q:]
	}
	if href == "" {
		// Query-only reference: same document, different query.
		return base + basePath + query
	}

	resolved := path.Clean(path.Dir(basePath) + "/" + href)

	// path.Clean drops a trailing slash, but "docs/" and "docs" are different
	// resources; references ending in a dot segment name a directory as well.
	namesDir := strings.HasSuffix(href, "/") ||
		href == "." || href == ".." ||
		strings.HasSuffix(href, "/.") || strings.HasSuffix(href, "/..")
	if namesDir && resolved != "/" {
		resolved += "/"
	}
	return base + resolved + query
}

// hrefScheme returns the lower-cased URL scheme that href starts with, or ""
// when href does not begin with a scheme. A scheme is a letter followed by
// letters, digits, '+', '-' or '.', terminated by ':'.
func hrefScheme(href string) string {
	for i := 0; i < len(href); i++ {
		c := href[i]
		switch {
		case c >= 'a' && c <= 'z', c >= 'A' && c <= 'Z':
			// Letters are valid at every position.
		case c >= '0' && c <= '9', c == '+', c == '-', c == '.':
			if i == 0 {
				return ""
			}
		case c == ':':
			if i == 0 {
				return ""
			}
			return strings.ToLower(href[:i])
		default:
			// '/', '?', '#' or any other byte before a ':' means no scheme.
			return ""
		}
	}
	return ""
}