加上中文注释

2026-04-08 17:48:05 +08:00
parent 6c2f5ad978
commit c154abf410
11 changed files with 830 additions and 560 deletions
@@ -1,36 +1,46 @@
 // Package parser extracts title, description, text content, and links from HTML.
+// parser 包负责 HTML 解析：从网页 HTML 中提取标题、描述、正文和所有超链接。
 package parser

 import (
-	"path"
-	"regexp"
-	"strings"
+	"path"                              // 路径处理（提取目录、规范化相对路径）
+	"regexp"                            // 正则表达式（空白字符替换）
+	"strings"                           // 字符串操作

-	"golang.org/x/net/html"
+	"golang.org/x/net/html"             // 标准 HTML 解析器（将 HTML 解析为 DOM 树）
 )

+// wsRe 空白字符正则：将任意连续空白字符（空格、换行、制表符等）替换为单个空格。
 var wsRe = regexp.MustCompile(`\s+`)

-// ParseHTML parses an HTML document and returns title, meta description, body text, and href list.
+// ParseHTML 解析 HTML 文档，返回标题、meta 描述、正文文本和所有超链接列表。
+// body：原始 HTML 字符串；baseURL：用于解析相对链接的基准 URL。
 func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
-	// Determine base scheme+host
+	// 从 baseURL 提取基准协议和主机（如 "https://example.com"）
 	base := baseFromURL(baseURL)
+	// 从 baseURL 提取当前页面路径（如 "/path/page.html"）
 	basePath := pathFromURL(baseURL)

+	// 将 HTML 字符串解析为 DOM 树
 	doc, err := html.Parse(strings.NewReader(body))
 	if err != nil {
-		return
+		return // 解析失败返回空
 	}

-	var textParts []string
+	var textParts []string // 收集所有正文文本片段

+	// 深度优先遍历 DOM 树
 	var dfs func(n *html.Node)
 	dfs = func(n *html.Node) {
 		if n.Type == html.ElementNode {
 			tag := strings.ToLower(n.Data)
+
+			// 跳过 <script>、<style>、<svg> 等无需解析内容的标签
 			if tag == "script" || tag == "style" || tag == "svg" {
 				return
 			}
+
+			// 提取 <meta name="description" content="..."> 标签
 			if tag == "meta" {
 				name := ""
 				content := ""
@@ -42,15 +52,20 @@ func ParseHTML(body, baseURL string) (title, description, text string, hrefs []s
 						content = a.Val
 					}
 				}
+				// 只取第一个描述
 				if name == "description" && description == "" {
 					description = content
 				}
 			}
+
+			// 提取 <a href="..."> 链接
 			if tag == "a" {
 				href := attrVal(n, "href")
 				if href != "" {
+					// 去除 URL 中的锚点（#fragment）
 					href = strings.SplitN(href, "#", 2)[0]
 					if href != "" {
+						// 解析为绝对 URL（处理相对路径、协议相对路径等）
 						href = resolveURL(base, basePath, href)
 						if href != "" {
 							hrefs = append(hrefs, href)
@@ -60,36 +75,42 @@ func ParseHTML(body, baseURL string) (title, description, text string, hrefs []s
 			}
 		}

+		// 提取文本节点（<title> 和正文内容）
 		if n.Type == html.TextNode && n.Parent != nil {
 			parentTag := ""
 			if n.Parent.Type == html.ElementNode {
 				parentTag = strings.ToLower(n.Parent.Data)
 			}
+			// 跳过 script/style/svg 内的文本
 			if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
 				goto children
 			}
+			// 空白压缩并去除首尾空格
 			s := wsRe.ReplaceAllString(n.Data, " ")
 			s = strings.TrimSpace(s)
 			if s != "" {
 				if parentTag == "title" {
-					title = s
+					title = s // NOTE: assigned unconditionally, so if several <title> text nodes exist the last one wins
 				} else {
-					textParts = append(textParts, s)
+					textParts = append(textParts, s) // 正文片段收集
 				}
 			}
 		}

 	children:
+		// 递归遍历子节点
 		for c := n.FirstChild; c != nil; c = c.NextSibling {
 			dfs(c)
 		}
 	}
 	dfs(doc)

+	// 将正文片段用空格连接为完整文本
 	text = strings.Join(textParts, " ")
 	return
 }

+// attrVal 提取 HTML 节点上指定名称的属性值（不区分大小写）。
 func attrVal(n *html.Node, key string) string {
 	for _, a := range n.Attr {
 		if strings.ToLower(a.Key) == key {
@@ -99,6 +120,8 @@ func attrVal(n *html.Node, key string) string {
 	return ""
 }

+// baseFromURL 从原始 URL 提取 "scheme://host" 部分（不含路径）。
+// 例如："https://example.com/path/page" → "https://example.com"。
 func baseFromURL(rawURL string) string {
 	idx := strings.Index(rawURL, "://")
 	if idx < 0 {
@@ -107,11 +130,13 @@ func baseFromURL(rawURL string) string {
 	rest := rawURL[idx+3:]
 	slash := strings.Index(rest, "/")
 	if slash < 0 {
-		return rawURL
+		return rawURL // 无路径，直接返回整个 URL
 	}
 	return rawURL[:idx+3+slash]
 }

+// pathFromURL 从原始 URL 提取路径部分（不含域名）。
+// 例如："https://example.com/path/page?q=1#top" → "/path/page"。
 func pathFromURL(rawURL string) string {
 	idx := strings.Index(rawURL, "://")
 	if idx < 0 {
@@ -122,32 +147,33 @@ func pathFromURL(rawURL string) string {
 	if slash < 0 {
 		return "/"
 	}
-	p := rest[slash:]
-	// strip query/fragment
+	p := rest[slash:] // 从第一个斜杠开始即为路径
+	// 去除查询字符串和锚点
 	p = strings.SplitN(p, "?", 2)[0]
 	p = strings.SplitN(p, "#", 2)[0]
 	return p
 }

// resolveURL resolves href against the page it was found on and returns an
// absolute URL, or "" when href cannot be turned into an http(s) link.
//
// base is the "scheme://host" prefix of the current page and basePath is the
// path of that page (for example "/docs/index.html"). Supported href forms:
//
//   - "http://..." / "https://..." (any letter case): returned unchanged;
//   - "//host/..." (protocol-relative): prefixed with the scheme of base,
//     or "" when base carries no "://";
//   - "/path" (absolute path): appended to base;
//   - "?query" or "#fragment" only: keeps basePath and appends the suffix;
//   - anything else: a relative path resolved against the directory of
//     basePath, with "." and ".." segments collapsed.
//
// An href that carries any other scheme (mailto:, javascript:, tel:, data:,
// ...) yields "" instead of being mistaken for a relative path.
func resolveURL(base, basePath, href string) string {
	// Already absolute. The scheme is matched case-insensitively because URL
	// schemes are not case-sensitive.
	hasPrefixFold := func(prefix string) bool {
		return len(href) >= len(prefix) && strings.EqualFold(href[:len(prefix)], prefix)
	}
	if hasPrefixFold("http://") || hasPrefixFold("https://") {
		return href
	}

	// Protocol-relative: reuse the scheme of the current page.
	if strings.HasPrefix(href, "//") {
		idx := strings.Index(base, "://")
		if idx < 0 {
			return ""
		}
		return base[:idx+1] + href
	}

	// Absolute path on the same host.
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// Any remaining "scheme:" reference is not an http(s) link; treating it as
	// a relative path would fabricate URLs like "https://host/dir/mailto:x".
	if hrefHasScheme(href) {
		return ""
	}

	// Guarantee a rooted path so the result never glues a segment directly
	// onto the host (path.Dir("") is ".", which would lose the leading "/").
	if !strings.HasPrefix(basePath, "/") {
		basePath = "/" + basePath
	}

	// Keep the query/fragment out of path.Clean: it must be carried verbatim,
	// otherwise "//" and "/../" inside a query value would be rewritten.
	rel, suffix := href, ""
	if i := strings.IndexAny(href, "?#"); i >= 0 {
		rel, suffix = href[:i], href[i:]
	}

	// A reference with an empty path component refers to the current page.
	if rel == "" {
		return base + basePath + suffix
	}

	resolved := path.Clean(path.Dir(basePath) + "/" + rel)

	// path.Clean strips trailing slashes, but "dir/" and "dir" are different
	// URLs; references ending in "/", "." or ".." denote a directory.
	endsInDir := strings.HasSuffix(rel, "/") ||
		rel == "." || rel == ".." ||
		strings.HasSuffix(rel, "/.") || strings.HasSuffix(rel, "/..")
	if endsInDir && resolved != "/" {
		resolved += "/"
	}
	return base + resolved + suffix
}

// hrefHasScheme reports whether href starts with a URI scheme, i.e. a letter
// followed by letters, digits, "+", "-" or "." and then a ":" that appears
// before any other character such as "/", "?" or "#".
func hrefHasScheme(href string) bool {
	for i := 0; i < len(href); i++ {
		c := href[i]
		switch {
		case (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'):
			// Letters are valid at every position of a scheme.
		case (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.':
			// These may not start a scheme.
			if i == 0 {
				return false
			}
		case c == ':':
			return i > 0
		default:
			return false
		}
	}
	return false
}