From 850a61c2ecb83b0bda21448f1393e480934556fd Mon Sep 17 00:00:00 2001 From: Francisco Requena Date: Thu, 20 Feb 2025 04:48:04 +0100 Subject: [PATCH] feat!: search improvements (#10113) * feat!: search improvements * Make it opt-in via _searchIndexUseMetadata --------- Co-authored-by: Yufei Huang --- .../FillMetadata.cs | 42 +++++++++ .../PostProcessors/ExtractSearchIndex.cs | 89 +++++++++++++++++-- .../PostProcessors/SearchIndexItem.cs | 14 ++- templates/default/src/search-worker.js | 5 +- templates/default/styles/docfx.js | 5 +- templates/modern/src/search-worker.ts | 4 +- templates/modern/src/search.ts | 11 ++- .../ExtractSearchIndexFromHtmlTest.cs | 77 ++++++++++++++-- 8 files changed, 225 insertions(+), 22 deletions(-) create mode 100644 src/Docfx.Build.ManagedReference/FillMetadata.cs diff --git a/src/Docfx.Build.ManagedReference/FillMetadata.cs b/src/Docfx.Build.ManagedReference/FillMetadata.cs new file mode 100644 index 00000000000..e35ea0f0725 --- /dev/null +++ b/src/Docfx.Build.ManagedReference/FillMetadata.cs @@ -0,0 +1,42 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. 
+ +using System.Collections.Immutable; +using System.Composition; +using Docfx.Build.Common; +using Docfx.DataContracts.ManagedReference; +using Docfx.Plugins; + +namespace Docfx.Build.ManagedReference; + +[Export(nameof(ManagedReferenceDocumentProcessor), typeof(IDocumentBuildStep))] +public class FillMetadata : BaseDocumentBuildStep +{ + public override string Name => nameof(FillMetadata); + public override int BuildOrder => 0x30; + + public override void Postbuild(ImmutableList<FileModel> models, IHostService host) /* for each Article model: clears Uid, then, if the page has items, sets IsMRef and copies the first item's FullName/Summary into the manifest properties */ + { + if (models.Count > 0) + { + foreach (var model in models) + { + if (model.Type != DocumentType.Article) + { + continue; + } + + model.ManifestProperties.Uid = null; + var pageViewModel = (PageViewModel)model.Content; + if (pageViewModel.Items.Count == 0) + { + continue; + } + + model.ManifestProperties.IsMRef = true; + model.ManifestProperties.Title = pageViewModel.Items[0].FullName; + model.ManifestProperties.Summary = pageViewModel.Items[0].Summary; + } + } + } +} diff --git a/src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs b/src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs index 95b236d0e7c..ff8a7f9470d 100644 --- a/src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs +++ b/src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs @@ -1,4 +1,4 @@ -// Licensed to the .NET Foundation under one or more agreements. +// Licensed to the .NET Foundation under one or more agreements. // The .NET Foundation licenses this file to you under the MIT license. 
using System.Collections.Immutable; @@ -18,6 +18,8 @@ partial class ExtractSearchIndex : IPostProcessor [GeneratedRegex(@"\s+")] private static partial Regex s_regexWhiteSpace(); + private static readonly Regex s_regexCase = new(@"[a-z0-9]+|[A-Z0-9]+[a-z0-9]*|[0-9]+", RegexOptions.Compiled); + private static readonly HashSet s_htmlInlineTags = new(StringComparer.OrdinalIgnoreCase) { "a", "area", "del", "ins", "link", "map", "meta", "abbr", "audio", "b", "bdo", "button", "canvas", "cite", "code", "command", "data", @@ -29,12 +31,20 @@ partial class ExtractSearchIndex : IPostProcessor public string Name => nameof(ExtractSearchIndex); public const string IndexFileName = "index.json"; + internal bool UseMetadata { get; set; } = false; + internal bool UseMetadataTitle { get; set; } = true; + public ImmutableDictionary PrepareMetadata(ImmutableDictionary metadata) { if (!metadata.ContainsKey("_enableSearch")) { metadata = metadata.Add("_enableSearch", true); } + + UseMetadata = metadata.TryGetValue("_searchIndexUseMetadata", out var useMetadataObject) && (bool)useMetadataObject; + UseMetadataTitle = !metadata.TryGetValue("_searchIndexUseMetadataTitle", out var useMetadataTitleObject) || (bool)useMetadataTitleObject; + + Logger.LogInfo($"{Name}: {nameof(UseMetadata)} = {UseMetadata}, {nameof(UseMetadataTitle)} = {UseMetadataTitle}"); return metadata; } @@ -49,14 +59,15 @@ public Manifest Process(Manifest manifest, string outputFolder, CancellationToke var htmlFiles = (from item in manifest.Files ?? 
Enumerable.Empty() from output in item.Output where item.Type != "Toc" && output.Key.Equals(".html", StringComparison.OrdinalIgnoreCase) - select output.Value.RelativePath).ToList(); + select (output.Value.RelativePath, item.Metadata)).ToList(); + if (htmlFiles.Count == 0) { return manifest; } Logger.LogInfo($"Extracting index data from {htmlFiles.Count} html files"); - foreach (var relativePath in htmlFiles) + foreach ((string relativePath, Dictionary metadata) in htmlFiles) { cancellationToken.ThrowIfCancellationRequested(); @@ -76,7 +87,7 @@ from output in item.Output Logger.LogWarning($"Warning: Can't load content from {filePath}: {ex.Message}"); continue; } - var indexItem = ExtractItem(html, relativePath); + var indexItem = ExtractItem(html, relativePath, metadata); if (indexItem != null) { indexData[relativePath] = indexItem; @@ -99,7 +110,7 @@ from output in item.Output return manifest; } - internal SearchIndexItem ExtractItem(HtmlDocument html, string href) + internal SearchIndexItem ExtractItem(HtmlDocument html, string href, Dictionary metadata = null) { var contentBuilder = new StringBuilder(); @@ -117,10 +128,37 @@ internal SearchIndexItem ExtractItem(HtmlDocument html, string href) ExtractTextFromNode(node, contentBuilder); } - var content = NormalizeContent(contentBuilder.ToString()); - var title = ExtractTitleFromHtml(html); + string title; + string summary = null; + string keywords = null; - return new SearchIndexItem { Href = href, Title = title, Keywords = content }; + var isMRef = metadata != null && metadata.TryGetValue("IsMRef", out var isMRefMetadata) && (bool)isMRefMetadata; + if (UseMetadata && isMRef) + { + title = UseMetadataTitle + ? (string)metadata["Title"] ?? 
ExtractTitleFromHtml(html) + : ExtractTitleFromHtml(html); + + var htmlSummary = (string)metadata["Summary"]; + if (!string.IsNullOrEmpty(htmlSummary)) + { + var htmlDocument = new HtmlDocument(); + htmlDocument.LoadHtml(htmlSummary); + var htmlRootNode = htmlDocument.DocumentNode.FirstChild; + var summaryBuilder = new StringBuilder(); + ExtractTextFromNode(htmlRootNode, summaryBuilder); + summary = NormalizeContent(summaryBuilder.ToString()); + } + + keywords = string.Join(' ', title.Split(' ').Select(word => string.Join(' ', GetStemAggregations(word.Split('.')[^1])))); + } + else + { + title = ExtractTitleFromHtml(html); + summary = NormalizeContent(contentBuilder.ToString()); + } + + return new SearchIndexItem { Href = href, Title = title, Summary = summary, Keywords = keywords }; } private static string ExtractTitleFromHtml(HtmlDocument html) @@ -140,6 +178,41 @@ private static string NormalizeContent(string str) return s_regexWhiteSpace().Replace(str, " ").Trim(); } + private static string[] GetStems(string str) /* HTML-decodes str and returns its s_regexCase matches; null/empty input yields a single empty stem */ + { + if (string.IsNullOrEmpty(str)) + { + return [string.Empty]; + } + str = WebUtility.HtmlDecode(str); + return s_regexCase.Matches(str).Select(m => m.Value).ToArray(); + } + + private static List<string> GetStemAggregations(string str) /* returns every non-empty, in-order combination of the stems of str, each concatenated into one token (2^n - 1 entries for n stems) */ + { + var stems = GetStems(str); + + var results = new List<string>(); + Aggregate(stems, [], results, 0); + return results; + + static void Aggregate(string[] input, List<string> current, List<string> results, int index) + { + if (index == input.Length) + { + return; + } + + for (int i = index; i < input.Length; i++) + { + current.Add(input[i]); + results.Add(string.Join(string.Empty, current)); + Aggregate(input, current, results, i + 1); + current.RemoveAt(current.Count - 1); + } + } + } + private static void ExtractTextFromNode(HtmlNode node, StringBuilder contentBuilder) { if (node == null) diff --git a/src/Docfx.Build/PostProcessors/SearchIndexItem.cs b/src/Docfx.Build/PostProcessors/SearchIndexItem.cs index f3f8ff7574d..327abf4d3b7 100644 
--- a/src/Docfx.Build/PostProcessors/SearchIndexItem.cs +++ b/src/Docfx.Build/PostProcessors/SearchIndexItem.cs @@ -20,6 +20,10 @@ class SearchIndexItem [JsonPropertyName("keywords")] public string Keywords { get; set; } + [JsonProperty("summary")] + [JsonPropertyName("summary")] + public string Summary { get; set; } + public override bool Equals(object obj) { return Equals(obj as SearchIndexItem); @@ -35,11 +39,17 @@ public bool Equals(SearchIndexItem other) { return true; } - return string.Equals(Title, other.Title) && string.Equals(Href, other.Href) && string.Equals(Keywords, other.Keywords); + return string.Equals(Title, other.Title) && + string.Equals(Href, other.Href) && + string.Equals(Summary, other.Summary) && + string.Equals(Keywords, other.Keywords); } public override int GetHashCode() { - return Title.GetHashCode() ^ Href.GetHashCode() ^ Keywords.GetHashCode(); + return (Title?.GetHashCode() ?? 0) ^ /* null-safe: Summary and Keywords are left null by some ExtractItem paths, and Equals already tolerates nulls */ + (Href?.GetHashCode() ?? 0) ^ + (Summary?.GetHashCode() ?? 0) ^ + (Keywords?.GetHashCode() ?? 0); } } diff --git a/templates/default/src/search-worker.js b/templates/default/src/search-worker.js index 77f8250ce86..91bff74279f 100644 --- a/templates/default/src/search-worker.js +++ b/templates/default/src/search-worker.js @@ -40,7 +40,7 @@ var results = []; hits.forEach(function (hit) { var item = searchData[hit.ref]; - results.push({ 'href': item.href, 'title': item.title, 'keywords': item.keywords }); + results.push({ 'href': item.href, 'title': item.title, 'summary': item.summary, 'keywords': item.keywords }); }); postMessage({ e: 'query-ready', q: q, d: results }); } @@ -51,7 +51,8 @@ this.pipeline.remove(lunr.stopWordFilter); this.ref('href'); this.field('title', { boost: 50 }); - this.field('keywords', { boost: 20 }); + this.field('keywords', { boost: 40 }); + this.field('summary', { boost: 20 }); for (var prop in searchData) { if (searchData.hasOwnProperty(prop)) { diff --git a/templates/default/styles/docfx.js b/templates/default/styles/docfx.js index 5bd62e28478..399435091f2 
100644 --- a/templates/default/styles/docfx.js +++ b/templates/default/styles/docfx.js @@ -250,6 +250,9 @@ $(function () { } function extractContentBrief(content) { + if (!content) { + return + } var briefOffset = 512; var words = query.split(/\s+/g); var queryIndex = content.indexOf(words[0]); @@ -285,7 +288,7 @@ $(function () { var itemRawHref = relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href); var itemHref = relHref + hit.href + "?q=" + query; var itemTitle = hit.title; - var itemBrief = extractContentBrief(hit.keywords); + var itemBrief = extractContentBrief(hit.summary || ''); var itemNode = $('
').attr('class', 'sr-item'); var itemTitleNode = $('
').attr('class', 'item-title').append($('').attr('href', itemHref).attr("target", "_blank").attr("rel", "noopener noreferrer").text(itemTitle)); diff --git a/templates/modern/src/search-worker.ts b/templates/modern/src/search-worker.ts index acaeee140c4..bbadfa433b7 100644 --- a/templates/modern/src/search-worker.ts +++ b/templates/modern/src/search-worker.ts @@ -10,6 +10,7 @@ import { get, set, createStore } from 'idb-keyval' type SearchHit = { href: string title: string + summary: string keywords: string } @@ -47,7 +48,8 @@ async function loadIndex({ lunrLanguages }: { lunrLanguages?: string[] }) { this.ref('href') this.field('title', { boost: 50 }) - this.field('keywords', { boost: 20 }) + this.field('keywords', { boost: 40 }) + this.field('summary', { boost: 20 }) if (lunrLanguages && lunrLanguages.length > 0) { this.use(lunr.multiLanguage(...lunrLanguages)) diff --git a/templates/modern/src/search.ts b/templates/modern/src/search.ts index aef1c540b61..5eebcc9d413 100644 --- a/templates/modern/src/search.ts +++ b/templates/modern/src/search.ts @@ -8,6 +8,7 @@ import { classMap } from 'lit-html/directives/class-map.js' type SearchHit = { href: string title: string + summary: string keywords: string } @@ -34,6 +35,11 @@ export async function enableSearch() { case 'index-ready': searchQuery.disabled = false searchQuery.addEventListener('input', onSearchQueryInput) + searchQuery.addEventListener('keypress', function(e) { + if (e.key === 'Enter') { + e.preventDefault() /* use the handler's own event argument rather than the deprecated global window.event */ + } + }) window.docfx.searchReady = true break case 'query-ready': @@ -56,7 +62,8 @@ export async function enableSearch() { if (query === '') { document.body.removeAttribute('data-search') } else { - worker.postMessage({ q: query }) + const additiveQuery = query.split(/\s+/).filter(w => w).map(w => '+' + w).join(' ') /* drop empty tokens so leading/trailing whitespace never produces a bare '+' */ + worker.postMessage({ q: additiveQuery }) } } @@ -108,7 +115,7 @@ export async function enableSearch() { const currentUrl = window.location.href const itemRawHref = 
relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href) const itemHref = relHref + hit.href + '?q=' + query - const itemBrief = extractContentBrief(hit.keywords) + const itemBrief = hit.summary ? extractContentBrief(hit.summary) : '' return html`
diff --git a/test/Docfx.Build.Tests/ExtractSearchIndexFromHtmlTest.cs b/test/Docfx.Build.Tests/ExtractSearchIndexFromHtmlTest.cs index 41b97201b79..478ce955538 100644 --- a/test/Docfx.Build.Tests/ExtractSearchIndexFromHtmlTest.cs +++ b/test/Docfx.Build.Tests/ExtractSearchIndexFromHtmlTest.cs @@ -39,7 +39,72 @@ This is article title html.LoadHtml(rawHtml); var href = "http://dotnet.github.io/docfx"; var item = _extractor.ExtractItem(html, href); - Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Hello World, Microsoft This is article title docfx can do anything..." }, item); + Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Summary = "Hello World, Microsoft This is article title docfx can do anything..." }, item); + } + + [Fact] + public void TestMRefMetadata() + { + var rawHtml = @" + + + This is title in head metadata + + +

This is Title

+

Hello World, + Microsoft +

+
+

+ This is article title +

+ docfx can do anything... +
+ + +"; + var html = new HtmlDocument(); + html.LoadHtml(rawHtml); + var href = "http://dotnet.github.io/docfx"; + _extractor.UseMetadata = false; + _extractor.UseMetadataTitle = false; + var itemNoMetadata = _extractor.ExtractItem(html, href, new() + { + ["IsMRef"] = true, + ["Title"] = "ManagedReferenceExample", + ["Summary"] = "Lorem Ipsum", + }); + Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Summary = "Hello World, Microsoft This is article title docfx can do anything..." }, itemNoMetadata); + _extractor.UseMetadata = true; + _extractor.UseMetadataTitle = true; + var itemWithMetadata = _extractor.ExtractItem(html, href, new() + { + ["IsMRef"] = true, + ["Title"] = "ManagedReferenceExample", + ["Summary"] = "Lorem Ipsum", + }); + Assert.Equal(new SearchIndexItem + { + Href = href, + Title = "ManagedReferenceExample", + Keywords = "Managed ManagedReference ManagedReferenceExample ManagedExample Reference ReferenceExample Example", + Summary = "Lorem Ipsum" + }, itemWithMetadata); + _extractor.UseMetadataTitle = false; + var itemWithMetadataNoTitle = _extractor.ExtractItem(html, href, new() + { + ["IsMRef"] = true, + ["Title"] = "ManagedReferenceExample", + ["Summary"] = "Lorem Ipsum", + }); + Assert.Equal(new SearchIndexItem + { + Href = href, + Title = "This is title in head metadata", + Keywords = "This is title in head metadata", + Summary = "Lorem Ipsum" + }, itemWithMetadataNoTitle); } [Fact] @@ -59,7 +124,7 @@ public void TestSearchableClass() html.LoadHtml(rawHtml); var href = "http://dotnet.github.io/docfx"; var item = _extractor.ExtractItem(html, href); - Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Cooooooool!" }, item); + Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Summary = "Cooooooool!" }, item); } [Fact] @@ -107,7 +172,7 @@ Only index once. 
html.LoadHtml(rawHtml); var href = "http://dotnet.github.io/docfx"; var item = _extractor.ExtractItem(html, href); - Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = "Only index once." }, item); + Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Summary = "Only index once." }, item); } [Fact] @@ -150,7 +215,7 @@ public void TestEmptyItem() html.LoadHtml(rawHtml); var href = "http://dotnet.github.io/docfx"; var item = _extractor.ExtractItem(html, href); - Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Keywords = string.Empty }, item); + Assert.Equal(new SearchIndexItem { Href = href, Title = "This is title in head metadata", Summary = string.Empty }, item); } [Fact] @@ -170,7 +235,7 @@ public void TestBlockTagsVsInlineTags() html.LoadHtml(rawHtml); var href = "http://dotnet.github.io/docfx"; var item = _extractor.ExtractItem(html, href); - Assert.Equal(new SearchIndexItem { Href = href, Title = "", Keywords = "Insert space in block level html tags Donotinsertspaceininlinehtmltags" }, item); + Assert.Equal(new SearchIndexItem { Href = href, Title = "", Summary = "Insert space in block level html tags Donotinsertspaceininlinehtmltags" }, item); } [Fact] @@ -225,7 +290,7 @@ This is article title ""index.html"": { ""href"": ""index.html"", ""title"": ""This is title in head metadata"", - ""keywords"": ""Hello World, Microsoft This is article title docfx can do anything... and it supports non-english characters like these: ãâáà êé í õôó Типы шрифтов 人物 文字"" + ""summary"": ""Hello World, Microsoft This is article title docfx can do anything... and it supports non-english characters like these: ãâáà êé í õôó Типы шрифтов 人物 文字"" } }"; var actualIndexJSON = File.ReadAllText(Path.Combine(tempTestFolder, "index.json"), Encoding.UTF8);