Skip to content

Commit

Permalink
feat!: search improvements (#10113)
Browse files Browse the repository at this point in the history
* feat!: search improvements

* Make it opt-in via _searchIndexUseMetadata

---------

Co-authored-by: Yufei Huang <[email protected]>
  • Loading branch information
frarees and yufeih authored Feb 20, 2025
1 parent b0f5472 commit 850a61c
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 22 deletions.
42 changes: 42 additions & 0 deletions src/Docfx.Build.ManagedReference/FillMetadata.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Immutable;
using System.Composition;
using Docfx.Build.Common;
using Docfx.DataContracts.ManagedReference;
using Docfx.Plugins;

namespace Docfx.Build.ManagedReference;

/// <summary>
/// Post-build step for managed reference documents. For every article model it
/// resets the <c>Uid</c> manifest property and, when the page has at least one
/// item, publishes <c>IsMRef</c>, <c>Title</c> and <c>Summary</c> manifest
/// properties taken from the first item of the page.
/// </summary>
[Export(nameof(ManagedReferenceDocumentProcessor), typeof(IDocumentBuildStep))]
public class FillMetadata : BaseDocumentBuildStep
{
    public override string Name => nameof(FillMetadata);

    public override int BuildOrder => 0x30;

    /// <summary>
    /// Visits every model and fills manifest properties for those of type
    /// <see cref="DocumentType.Article"/>; other models are left untouched.
    /// </summary>
    /// <param name="models">Models produced by the build.</param>
    /// <param name="host">Host service; not used by this step.</param>
    public override void Postbuild(ImmutableList<FileModel> models, IHostService host)
    {
        foreach (var model in models)
        {
            if (model.Type == DocumentType.Article)
            {
                FillArticle(model);
            }
        }
    }

    /// <summary>
    /// Writes the manifest properties of a single article model.
    /// </summary>
    private static void FillArticle(FileModel model)
    {
        // Uid is cleared for every article, including pages that have no items.
        model.ManifestProperties.Uid = null;

        var page = (PageViewModel)model.Content;
        if (page.Items.Count == 0)
        {
            return;
        }

        // Only the first item of the page supplies the title and the summary.
        var first = page.Items[0];
        model.ManifestProperties.IsMRef = true;
        model.ManifestProperties.Title = first.FullName;
        model.ManifestProperties.Summary = first.Summary;
    }
}
89 changes: 81 additions & 8 deletions src/Docfx.Build/PostProcessors/ExtractSearchIndex.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Licensed to the .NET Foundation under one or more agreements.
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Collections.Immutable;
Expand All @@ -18,6 +18,8 @@ partial class ExtractSearchIndex : IPostProcessor
[GeneratedRegex(@"\s+")]
private static partial Regex s_regexWhiteSpace();

// Splits an identifier into case-delimited stems. Alternatives are tried left to right:
//   1. a run of lowercase letters/digits                      ("abc1")
//   2. a run of uppercase letters/digits followed by an optional
//      run of lowercase letters/digits                          ("Get", "HTMLParser")
//   3. a run of digits (already covered by alternative 1, so it never gets to match)
// Any other character (e.g. '.', '_', '<') is not matched and therefore acts as a separator.
private static readonly Regex s_regexCase = new(@"[a-z0-9]+|[A-Z0-9]+[a-z0-9]*|[0-9]+", RegexOptions.Compiled);

private static readonly HashSet<string> s_htmlInlineTags = new(StringComparer.OrdinalIgnoreCase)
{
"a", "area", "del", "ins", "link", "map", "meta", "abbr", "audio", "b", "bdo", "button", "canvas", "cite", "code", "command", "data",
Expand All @@ -29,12 +31,20 @@ partial class ExtractSearchIndex : IPostProcessor
public string Name => nameof(ExtractSearchIndex);
public const string IndexFileName = "index.json";

// When true, ExtractItem builds title/summary/keywords from the manifest item metadata for
// items flagged "IsMRef". Assigned in PrepareMetadata from "_searchIndexUseMetadata"; off by default.
internal bool UseMetadata { get; set; } = false;
// When true (and UseMetadata applies), the title comes from the "Title" metadata entry rather than
// from the HTML. Assigned in PrepareMetadata from "_searchIndexUseMetadataTitle"; on by default.
internal bool UseMetadataTitle { get; set; } = true;

/// <summary>
/// Ensures <c>_enableSearch</c> is present in the metadata (adding it as <c>true</c> when
/// missing) and reads the search-index options <c>_searchIndexUseMetadata</c> (default
/// <c>false</c>) and <c>_searchIndexUseMetadataTitle</c> (default <c>true</c>).
/// </summary>
/// <param name="metadata">Metadata to inspect.</param>
/// <returns>The metadata, with <c>_enableSearch</c> added when it was absent.</returns>
public ImmutableDictionary<string, object> PrepareMetadata(ImmutableDictionary<string, object> metadata)
{
    if (!metadata.ContainsKey("_enableSearch"))
    {
        metadata = metadata.Add("_enableSearch", true);
    }

    UseMetadata = ReadFlag("_searchIndexUseMetadata", defaultValue: false);
    UseMetadataTitle = ReadFlag("_searchIndexUseMetadataTitle", defaultValue: true);

    Logger.LogInfo($"{Name}: {nameof(UseMetadata)} = {UseMetadata}, {nameof(UseMetadataTitle)} = {UseMetadataTitle}");
    return metadata;

    // Reads a boolean option without hard-casting: a value that is null or not a bool
    // (e.g. the string "true") must not abort the build with a cast/null exception.
    // Boolean-like strings are honoured; anything else falls back to the default.
    bool ReadFlag(string key, bool defaultValue)
    {
        if (!metadata.TryGetValue(key, out var raw))
        {
            return defaultValue;
        }

        return raw switch
        {
            bool flag => flag,
            string text when bool.TryParse(text, out var parsed) => parsed,
            _ => defaultValue,
        };
    }
}

Expand All @@ -49,14 +59,15 @@ public Manifest Process(Manifest manifest, string outputFolder, CancellationToke
var htmlFiles = (from item in manifest.Files ?? Enumerable.Empty<ManifestItem>()
from output in item.Output
where item.Type != "Toc" && output.Key.Equals(".html", StringComparison.OrdinalIgnoreCase)
select output.Value.RelativePath).ToList();
select (output.Value.RelativePath, item.Metadata)).ToList();

if (htmlFiles.Count == 0)
{
return manifest;
}

Logger.LogInfo($"Extracting index data from {htmlFiles.Count} html files");
foreach (var relativePath in htmlFiles)
foreach ((string relativePath, Dictionary<string, object> metadata) in htmlFiles)
{
cancellationToken.ThrowIfCancellationRequested();

Expand All @@ -76,7 +87,7 @@ from output in item.Output
Logger.LogWarning($"Warning: Can't load content from {filePath}: {ex.Message}");
continue;
}
var indexItem = ExtractItem(html, relativePath);
var indexItem = ExtractItem(html, relativePath, metadata);
if (indexItem != null)
{
indexData[relativePath] = indexItem;
Expand All @@ -99,7 +110,7 @@ from output in item.Output
return manifest;
}

internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
internal SearchIndexItem ExtractItem(HtmlDocument html, string href, Dictionary<string, object> metadata = null)
{
var contentBuilder = new StringBuilder();

Expand All @@ -117,10 +128,37 @@ internal SearchIndexItem ExtractItem(HtmlDocument html, string href)
ExtractTextFromNode(node, contentBuilder);
}

var content = NormalizeContent(contentBuilder.ToString());
var title = ExtractTitleFromHtml(html);
string title;
string summary = null;
string keywords = null;

return new SearchIndexItem { Href = href, Title = title, Keywords = content };
var isMRef = metadata != null && metadata.TryGetValue("IsMRef", out var isMRefMetadata) && (bool)isMRefMetadata;
if (UseMetadata && isMRef)
{
title = UseMetadataTitle
? (string)metadata["Title"] ?? ExtractTitleFromHtml(html)
: ExtractTitleFromHtml(html);

var htmlSummary = (string)metadata["Summary"];
if (!string.IsNullOrEmpty(htmlSummary))
{
var htmlDocument = new HtmlDocument();
htmlDocument.LoadHtml(htmlSummary);
var htmlRootNode = htmlDocument.DocumentNode.FirstChild;
var summaryBuilder = new StringBuilder();
ExtractTextFromNode(htmlRootNode, summaryBuilder);
summary = NormalizeContent(summaryBuilder.ToString());
}

keywords = string.Join(' ', title.Split(' ').Select(word => string.Join(' ', GetStemAggregations(word.Split('.')[^1]))));
}
else
{
title = ExtractTitleFromHtml(html);
summary = NormalizeContent(contentBuilder.ToString());
}

return new SearchIndexItem { Href = href, Title = title, Summary = summary, Keywords = keywords };
}

private static string ExtractTitleFromHtml(HtmlDocument html)
Expand All @@ -140,6 +178,41 @@ private static string NormalizeContent(string str)
return s_regexWhiteSpace().Replace(str, " ").Trim();
}

/// <summary>
/// HTML-decodes <paramref name="str"/> and returns every match of <c>s_regexCase</c> in it,
/// in order of appearance. A null or empty input yields a single empty string.
/// </summary>
private static string[] GetStems(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return [string.Empty];
    }

    var decoded = WebUtility.HtmlDecode(str);
    var matches = s_regexCase.Matches(decoded);

    var stems = new string[matches.Count];
    for (var i = 0; i < stems.Length; i++)
    {
        stems[i] = matches[i].Value;
    }

    return stems;
}

/// <summary>
/// Returns every order-preserving combination of the stems of <paramref name="str"/>,
/// each combination concatenated without a separator. For stems A, B, C the result is
/// A, AB, ABC, AC, B, BC, C. The number of entries is 2^n - 1 for n stems.
/// </summary>
private static List<string> GetStemAggregations(string str)
{
    var combinations = new List<string>();
    Expand(GetStems(str), string.Empty, 0, combinations);
    return combinations;

    // Depth-first walk: append each remaining stem to the running prefix, record the
    // concatenation, then go deeper with the stems that come after it.
    static void Expand(string[] stems, string prefix, int start, List<string> output)
    {
        for (var next = start; next < stems.Length; next++)
        {
            var candidate = prefix + stems[next];
            output.Add(candidate);
            Expand(stems, candidate, next + 1, output);
        }
    }
}

private static void ExtractTextFromNode(HtmlNode node, StringBuilder contentBuilder)
{
if (node == null)
Expand Down
14 changes: 12 additions & 2 deletions src/Docfx.Build/PostProcessors/SearchIndexItem.cs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ class SearchIndexItem
[JsonPropertyName("keywords")]
public string Keywords { get; set; }

[JsonProperty("summary")]
[JsonPropertyName("summary")]
public string Summary { get; set; }

public override bool Equals(object obj)
{
return Equals(obj as SearchIndexItem);
Expand All @@ -35,11 +39,17 @@ public bool Equals(SearchIndexItem other)
{
return true;
}
return string.Equals(Title, other.Title) && string.Equals(Href, other.Href) && string.Equals(Keywords, other.Keywords);
return string.Equals(Title, other.Title) &&
string.Equals(Href, other.Href) &&
string.Equals(Summary, other.Summary) &&
string.Equals(Keywords, other.Keywords);
}

/// <summary>
/// Combines the hash codes of the same four strings that <c>Equals</c> compares
/// (Title, Href, Summary, Keywords).
/// </summary>
/// <returns>The XOR of the individual hash codes; a null string contributes 0.</returns>
public override int GetHashCode()
{
    // Any of these strings can be null (an index item may carry a summary without
    // keywords or the other way round), so never dereference them directly.
    return (Title?.GetHashCode() ?? 0) ^
           (Href?.GetHashCode() ?? 0) ^
           (Summary?.GetHashCode() ?? 0) ^
           (Keywords?.GetHashCode() ?? 0);
}
}
5 changes: 3 additions & 2 deletions templates/default/src/search-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
var results = [];
hits.forEach(function (hit) {
var item = searchData[hit.ref];
results.push({ 'href': item.href, 'title': item.title, 'keywords': item.keywords });
results.push({ 'href': item.href, 'title': item.title, 'summary': item.summary, 'keywords': item.keywords });
});
postMessage({ e: 'query-ready', q: q, d: results });
}
Expand All @@ -51,7 +51,8 @@
this.pipeline.remove(lunr.stopWordFilter);
this.ref('href');
this.field('title', { boost: 50 });
this.field('keywords', { boost: 20 });
this.field('keywords', { boost: 40 });
this.field('summary', { boost: 20 });

for (var prop in searchData) {
if (searchData.hasOwnProperty(prop)) {
Expand Down
5 changes: 4 additions & 1 deletion templates/default/styles/docfx.js
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,9 @@ $(function () {
}

function extractContentBrief(content) {
if (!content) {
return
}
var briefOffset = 512;
var words = query.split(/\s+/g);
var queryIndex = content.indexOf(words[0]);
Expand Down Expand Up @@ -285,7 +288,7 @@ $(function () {
var itemRawHref = relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href);
var itemHref = relHref + hit.href + "?q=" + query;
var itemTitle = hit.title;
var itemBrief = extractContentBrief(hit.keywords);
var itemBrief = extractContentBrief(hit.summary || '');

var itemNode = $('<div>').attr('class', 'sr-item');
var itemTitleNode = $('<div>').attr('class', 'item-title').append($('<a>').attr('href', itemHref).attr("target", "_blank").attr("rel", "noopener noreferrer").text(itemTitle));
Expand Down
4 changes: 3 additions & 1 deletion templates/modern/src/search-worker.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import { get, set, createStore } from 'idb-keyval'
// One entry of the search index as handled by the worker.
// `href` is the lunr document ref; `title`, `keywords` and `summary` are the indexed fields.
type SearchHit = {
href: string
title: string
summary: string
keywords: string
}

Expand Down Expand Up @@ -47,7 +48,8 @@ async function loadIndex({ lunrLanguages }: { lunrLanguages?: string[] }) {

this.ref('href')
this.field('title', { boost: 50 })
this.field('keywords', { boost: 20 })
this.field('keywords', { boost: 40 })
this.field('summary', { boost: 20 })

if (lunrLanguages && lunrLanguages.length > 0) {
this.use(lunr.multiLanguage(...lunrLanguages))
Expand Down
11 changes: 9 additions & 2 deletions templates/modern/src/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { classMap } from 'lit-html/directives/class-map.js'
// One search result as received from the search worker.
// `href` is used to build the result link and `summary` feeds the result brief.
type SearchHit = {
href: string
title: string
summary: string
keywords: string
}

Expand All @@ -34,6 +35,11 @@ export async function enableSearch() {
case 'index-ready':
searchQuery.disabled = false
searchQuery.addEventListener('input', onSearchQueryInput)
// Suppress the default action of the Enter key inside the search box.
// Uses the handler's own argument rather than the deprecated global `event`.
searchQuery.addEventListener('keypress', function(e) {
  if (e.key === 'Enter') {
    e.preventDefault()
  }
})
window.docfx.searchReady = true
break
case 'query-ready':
Expand All @@ -56,7 +62,8 @@ export async function enableSearch() {
// Every whitespace-separated word becomes a required term ('+' prefix).
// Empty fragments caused by leading/trailing whitespace are dropped so that no bare '+'
// (which the query parser rejects) is ever sent; a query without any word is treated as empty.
const terms = query.split(/\s+/).filter(w => w.length > 0)
if (terms.length === 0) {
  document.body.removeAttribute('data-search')
} else {
  const additiveQuery = terms.map(w => '+' + w).join(' ')
  worker.postMessage({ q: additiveQuery })
}
}

Expand Down Expand Up @@ -108,7 +115,7 @@ export async function enableSearch() {
const currentUrl = window.location.href
const itemRawHref = relativeUrlToAbsoluteUrl(currentUrl, relHref + hit.href)
const itemHref = relHref + hit.href + '?q=' + query
const itemBrief = extractContentBrief(hit.keywords)
const itemBrief = hit.summary ? extractContentBrief(hit.summary) : ''
return html`
<div class="sr-item">
Expand Down
Loading

0 comments on commit 850a61c

Please sign in to comment.