= 0 ? html[startIndex..] : html;
var maxCount = Math.Max(1, maxItems);
var seenUrls = new HashSet
(StringComparer.OrdinalIgnoreCase);
foreach (Match match in CnrListAnchorRegex.Matches(scope))
{
var normalizedUrl = ResolveAbsoluteUrl(match.Groups["url"].Value, listPageUri);
if (string.IsNullOrWhiteSpace(normalizedUrl) || !seenUrls.Add(normalizedUrl))
{
continue;
}
var inner = match.Groups["inner"].Value;
var title = ExtractTagInnerText(inner, "strong");
if (string.IsNullOrWhiteSpace(title))
{
continue;
}
var summary = ExtractTagInnerText(inner, "em");
var publishTime = ExtractTagInnerTextByClass(inner, "span", "publishTime");
var imageUrl = ExtractFirstImageUrl(inner, listPageUri);
yield return new DailyNewsItemSnapshot(
Title: title,
Summary: summary,
Url: normalizedUrl,
ImageUrl: imageUrl,
PublishTime: publishTime);
if (seenUrls.Count >= maxCount)
{
yield break;
}
}
}
/// <summary>
/// Extracts the inner content of the first <paramref name="tagName"/> element in an HTML fragment
/// and returns it as normalized inline text.
/// </summary>
/// <param name="htmlFragment">HTML fragment to search.</param>
/// <param name="tagName">Element name to look for, without angle brackets (for example "strong").</param>
/// <returns>
/// The text produced by <c>NormalizeInlineText</c> for the first match, or <c>null</c> when either
/// argument is blank or no matching element is found.
/// </returns>
private static string? ExtractTagInnerText(string htmlFragment, string tagName)
{
    if (string.IsNullOrWhiteSpace(htmlFragment) || string.IsNullOrWhiteSpace(tagName))
    {
        return null;
    }

    // Escape the tag name so it is always treated literally inside the pattern.
    var escapedTag = Regex.Escape(tagName);

    // The named group "value" lazily captures everything up to the first closing tag;
    // Singleline lets the content span line breaks.
    var match = Regex.Match(
        htmlFragment,
        $"<{escapedTag}[^>]*>(?<value>.*?)</{escapedTag}>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    return match.Success
        ? NormalizeInlineText(match.Groups["value"].Value)
        : null;
}
/// <summary>
/// Extracts the inner content of the first <paramref name="tagName"/> element whose double-quoted
/// <c>class</c> attribute contains <paramref name="className"/>, and returns it as normalized inline text.
/// </summary>
/// <param name="htmlFragment">HTML fragment to search.</param>
/// <param name="tagName">Element name to look for, without angle brackets (for example "span").</param>
/// <param name="className">Text that must appear inside the element's <c>class</c> attribute value.</param>
/// <returns>
/// The text produced by <c>NormalizeInlineText</c> for the first match, or <c>null</c> when any
/// argument is blank or no matching element is found.
/// </returns>
private static string? ExtractTagInnerTextByClass(string htmlFragment, string tagName, string className)
{
    if (string.IsNullOrWhiteSpace(htmlFragment) ||
        string.IsNullOrWhiteSpace(tagName) ||
        string.IsNullOrWhiteSpace(className))
    {
        return null;
    }

    // Both user-supplied pieces are escaped so they are matched literally.
    var escapedTag = Regex.Escape(tagName);
    var escapedClass = Regex.Escape(className);

    // The class name is matched as a substring of the attribute value; the named group "value"
    // lazily captures the element content up to the first closing tag.
    var match = Regex.Match(
        htmlFragment,
        $"<{escapedTag}[^>]*class=\"[^\"]*{escapedClass}[^\"]*\"[^>]*>(?<value>.*?)</{escapedTag}>",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    return match.Success
        ? NormalizeInlineText(match.Groups["value"].Value)
        : null;
}
/// <summary>
/// Finds the first image reference in an HTML fragment that resolves to a URL accepted by
/// <c>IsLikelyContentImageUrl</c>.
/// </summary>
/// <param name="htmlFragment">HTML fragment to scan with <c>HtmlImageTagRegex</c>.</param>
/// <param name="pageUri">Base URI used to resolve each captured "url" group.</param>
/// <returns>The first accepted resolved URL, or <c>null</c> when the fragment is blank or nothing qualifies.</returns>
private static string? ExtractFirstImageUrl(string htmlFragment, Uri pageUri)
{
    if (string.IsNullOrWhiteSpace(htmlFragment))
    {
        return null;
    }

    // Resolve every captured URL in document order and keep the first one that passes the filter.
    return HtmlImageTagRegex
        .Matches(htmlFragment)
        .Cast<Match>()
        .Select(imageTag => ResolveAbsoluteUrl(imageTag.Groups["url"].Value, pageUri))
        .FirstOrDefault(IsLikelyContentImageUrl);
}
/// <summary>
/// Downloads an article page and tries to find a cover image URL in it.
/// </summary>
/// <remarks>
/// Candidates are checked in order: the <c>og:image</c> meta tag, the <c>name="image"</c> meta tag,
/// then every <c>img</c> tag's <c>src</c>. Each candidate is resolved against the article URL and
/// must be accepted by <c>IsLikelyContentImageUrl</c>.
/// </remarks>
/// <param name="articleUrl">Absolute URL of the article page.</param>
/// <param name="cancellationToken">Token forwarded to the HTML download.</param>
/// <returns>
/// The first accepted image URL, or <c>null</c> when <paramref name="articleUrl"/> is not an absolute
/// URI, the page is empty, or no candidate qualifies.
/// </returns>
private async Task<string?> TryFetchArticleCoverImageAsync(string articleUrl, CancellationToken cancellationToken)
{
    if (!Uri.TryCreate(articleUrl, UriKind.Absolute, out var articleUri))
    {
        return null;
    }

    var html = await FetchHtmlWithCnrEncodingAsync(articleUrl, cancellationToken);
    if (string.IsNullOrWhiteSpace(html))
    {
        return null;
    }

    // Meta tags are preferred over inline images; og:image is tried before the generic "image" meta.
    string[] metaPatterns =
    [
        "<meta[^>]+property=\"og:image\"[^>]+content=\"(?<url>[^\"]+)\"",
        "<meta[^>]+name=\"image\"[^>]+content=\"(?<url>[^\"]+)\""
    ];

    foreach (var metaPattern in metaPatterns)
    {
        var metaMatch = Regex.Match(html, metaPattern, RegexOptions.IgnoreCase);
        if (!metaMatch.Success)
        {
            continue;
        }

        var metaUrl = ResolveAbsoluteUrl(metaMatch.Groups["url"].Value, articleUri);
        if (IsLikelyContentImageUrl(metaUrl))
        {
            return metaUrl;
        }
    }

    // Fall back to the first inline image that passes the content-image filter.
    var imageMatches = Regex.Matches(
        html,
        "<img[^>]+src=\"(?<url>[^\"]+)\"",
        RegexOptions.IgnoreCase | RegexOptions.Singleline);

    foreach (Match imageMatch in imageMatches)
    {
        var normalized = ResolveAbsoluteUrl(imageMatch.Groups["url"].Value, articleUri);
        if (IsLikelyContentImageUrl(normalized))
        {
            return normalized;
        }
    }

    return null;
}
/// <summary>
/// Heuristically decides whether a URL points at a content image.
/// </summary>
/// <param name="imageUrl">Candidate URL; may be <c>null</c> or blank.</param>
/// <returns>
/// <c>true</c> when the trimmed URL ends with .jpg, .jpeg, .png, .webp or .avif (case-insensitive)
/// and does not contain "share", "logo" or "code.png" anywhere; otherwise <c>false</c>.
/// </returns>
private static bool IsLikelyContentImageUrl(string? imageUrl)
{
    if (string.IsNullOrWhiteSpace(imageUrl))
    {
        return false;
    }

    var trimmed = imageUrl.Trim();

    string[] acceptedExtensions = [".jpg", ".jpeg", ".png", ".webp", ".avif"];
    var hasImageExtension = Array.Exists(
        acceptedExtensions,
        extension => trimmed.EndsWith(extension, StringComparison.OrdinalIgnoreCase));
    if (!hasImageExtension)
    {
        return false;
    }

    // URLs carrying any of these markers are rejected even when the extension is acceptable.
    string[] rejectedMarkers = ["share", "logo", "code.png"];
    var hasRejectedMarker = Array.Exists(
        rejectedMarkers,
        marker => trimmed.Contains(marker, StringComparison.OrdinalIgnoreCase));

    return !hasRejectedMarker;
}
/// <summary>
/// Turns a raw link value taken from markup into an absolute URL string.
/// </summary>
/// <param name="rawUrl">Raw link value; may be absolute, protocol-relative, root-relative or relative.</param>
/// <param name="baseUri">URI of the page the link was found on; supplies the scheme and the base for relative links.</param>
/// <returns>
/// The absolute URL, or <c>null</c> when the input is blank, contains <c>'+</c> or <c>+'</c>,
/// is an absolute URI with a scheme other than http/https, or cannot be resolved.
/// </returns>
private static string? ResolveAbsoluteUrl(string? rawUrl, Uri baseUri)
{
    if (string.IsNullOrWhiteSpace(rawUrl))
    {
        return null;
    }

    var candidate = rawUrl.Trim();

    // NOTE(review): presumably filters out URLs assembled by inline script string concatenation.
    if (candidate.Contains("'+", StringComparison.Ordinal) ||
        candidate.Contains("+'", StringComparison.Ordinal))
    {
        return null;
    }

    // Protocol-relative link: borrow the scheme of the containing page.
    if (candidate.StartsWith("//", StringComparison.Ordinal))
    {
        return $"{baseUri.Scheme}:{candidate}";
    }

    // On Unix, Uri.TryCreate(..., UriKind.Absolute) accepts "/path" as an implicit file:// URI,
    // which would make every root-relative link fail the scheme check below. Such values are
    // therefore always resolved against the base URI instead.
    var isRootRelative = candidate.StartsWith("/", StringComparison.Ordinal);
    if (!isRootRelative && Uri.TryCreate(candidate, UriKind.Absolute, out var absoluteUri))
    {
        var isWebScheme =
            string.Equals(absoluteUri.Scheme, Uri.UriSchemeHttp, StringComparison.OrdinalIgnoreCase) ||
            string.Equals(absoluteUri.Scheme, Uri.UriSchemeHttps, StringComparison.OrdinalIgnoreCase);
        return isWebScheme ? absoluteUri.ToString() : null;
    }

    return Uri.TryCreate(baseUri, candidate, out var relativeUri)
        ? relativeUri.ToString()
        : null;
}
/// <summary>
/// Converts an HTML snippet into a single line of plain text.
/// </summary>
/// <param name="text">HTML snippet; may be <c>null</c> or blank.</param>
/// <returns>
/// The HTML-decoded text with every <c>HtmlTagRegex</c> match replaced by a space, whitespace runs
/// collapsed to one space and the ends trimmed; an empty string for blank input.
/// </returns>
private static string NormalizeInlineText(string? text)
{
    if (!string.IsNullOrWhiteSpace(text))
    {
        // Decode entities first, then blank out markup so adjacent words do not fuse together.
        var tagFree = HtmlTagRegex.Replace(WebUtility.HtmlDecode(text) ?? string.Empty, " ");
        var collapsed = Regex.Replace(tagFree, "\\s+", " ");
        return collapsed.Trim();
    }

    return string.Empty;
}
/// <summary>
/// Reads the value at a property path inside a JSON element and renders it as a string.
/// </summary>
/// <param name="node">Element the path starts from.</param>
/// <param name="path">Property names to follow, in order.</param>
/// <returns>
/// The string value, the raw text of a number, or "true"/"false" for booleans; <c>null</c> when
/// the path cannot be followed or the value is of any other kind.
/// </returns>
private static string? ReadString(JsonElement node, params string[] path)
{
    if (TryGetNode(node, path) is not { } element)
    {
        return null;
    }

    switch (element.ValueKind)
    {
        case JsonValueKind.String:
            return element.GetString();
        case JsonValueKind.Number:
            // Raw text keeps the number exactly as it was written in the JSON.
            return element.GetRawText();
        case JsonValueKind.True:
            return "true";
        case JsonValueKind.False:
            return "false";
        default:
            return null;
    }
}
/// <summary>
/// Walks a chain of property names down from a JSON element.
/// </summary>
/// <param name="node">Element the walk starts from.</param>
/// <param name="path">Property names to follow, in order; an empty path yields <paramref name="node"/> itself.</param>
/// <returns>
/// The element reached at the end of the path, or <c>null</c> as soon as the current element is
/// not an object or lacks the next property.
/// </returns>
private static JsonElement? TryGetNode(JsonElement node, params string[] path)
{
    var cursor = node;
    for (var index = 0; index < path.Length; index++)
    {
        var canDescend = cursor.ValueKind == JsonValueKind.Object;
        if (canDescend && cursor.TryGetProperty(path[index], out var child))
        {
            cursor = child;
            continue;
        }

        return null;
    }

    return cursor;
}
/// <summary>
/// Builds an artwork image URL from an image identifier.
/// </summary>
/// <param name="imageId">Image identifier; may be <c>null</c> or blank.</param>
/// <returns>
/// <c>_options.ArtInstituteImageUrlTemplate</c> formatted (invariant culture) with the trimmed
/// identifier as its only argument, or <c>null</c> when the identifier is blank.
/// </returns>
private string? BuildArtworkImageUrl(string? imageId)
    => string.IsNullOrWhiteSpace(imageId)
        ? null
        : string.Format(
            CultureInfo.InvariantCulture,
            _options.ArtInstituteImageUrlTemplate,
            imageId.Trim());
/// <summary>
/// Determines which artwork mirror source to use for a query.
/// </summary>
/// <param name="query">Query whose <c>MirrorSource</c>, when not blank, takes precedence.</param>
/// <returns>
/// The normalized mirror source from the query; otherwise the normalized value from the loaded
/// application settings; otherwise <c>DailyArtworkMirrorSources.Overseas</c> when loading or
/// normalizing the settings value throws.
/// </returns>
private string ResolveArtworkMirrorSource(DailyArtworkQuery query)
{
    var hasExplicitSource = !string.IsNullOrWhiteSpace(query.MirrorSource);
    if (hasExplicitSource)
    {
        return DailyArtworkMirrorSources.Normalize(query.MirrorSource);
    }

    try
    {
        var configuredSource = _appSettingsService.Load().DailyArtworkMirrorSource;
        return DailyArtworkMirrorSources.Normalize(configuredSource);
    }
    catch
    {
        // Best effort: any failure while reading settings falls back to the overseas mirror.
        return DailyArtworkMirrorSources.Overseas;
    }
}
/// <summary>
/// Requests one page of artwork candidates from the configured artwork API and returns the raw response body.
/// </summary>
/// <param name="localDate">Date used to pick the result page, so the same date always requests the same page.</param>
/// <param name="cancellationToken">Token forwarded to the HTTP request and to reading the body.</param>
/// <returns>The response body text of a successful request.</returns>
/// <exception cref="HttpRequestException">
/// Thrown when the response status is not successful; the message carries the status code and
/// the first 180 characters of the body.
/// </exception>
private async Task<string> FetchOverseasArtworkPayloadAsync(DateOnly localDate, CancellationToken cancellationToken)
{
    var candidateCount = Math.Clamp(_options.DefaultArtworkCandidateCount, 10, 100);

    // Day of year is folded into the 1..100 range to select the page.
    var page = Math.Clamp((localDate.DayOfYear % 100) + 1, 1, 100);

    var requestUrl = string.Format(
        CultureInfo.InvariantCulture,
        _options.ArtInstituteArtworkApiTemplate,
        page,
        candidateCount);

    using var request = new HttpRequestMessage(HttpMethod.Get, requestUrl);
    request.Headers.TryAddWithoutValidation("User-Agent", UserAgent);

    using var response = await _httpClient.SendAsync(request, cancellationToken);

    // The body is read before the status check so it can be quoted in the error message.
    var responseText = await response.Content.ReadAsStringAsync(cancellationToken);
    if (!response.IsSuccessStatusCode)
    {
        throw new HttpRequestException($"HTTP {(int)response.StatusCode}: {Truncate(responseText, 180)}");
    }

    return responseText;
}
/// <summary>
/// Builds an absolute image URL from a raw value that may be absolute or a path on a fallback host.
/// </summary>
/// <param name="rawValue">Raw image URL or path; may be <c>null</c> or blank.</param>
/// <param name="fallbackHost">Absolute base URL used when <paramref name="rawValue"/> is a path.</param>
/// <returns>
/// The absolute URL; <c>null</c> when the raw value is blank, the fallback host is not an absolute
/// URI, or the two cannot be combined.
/// </returns>
private static string? BuildDomesticImageUrl(string? rawValue, string fallbackHost)
{
    if (string.IsNullOrWhiteSpace(rawValue))
    {
        return null;
    }

    var candidate = rawValue.Trim();
    var startsWithSlash = candidate.StartsWith("/", StringComparison.Ordinal);

    // On Unix, Uri.TryCreate(..., UriKind.Absolute) accepts "/path" as an implicit file:// URI.
    // Values starting with a slash are therefore never treated as already absolute and are
    // always combined with the fallback host.
    if (!startsWithSlash && Uri.TryCreate(candidate, UriKind.Absolute, out var absoluteUri))
    {
        return absoluteUri.ToString();
    }

    if (!Uri.TryCreate(fallbackHost, UriKind.Absolute, out var hostUri))
    {
        return null;
    }

    // Force a root-relative path so the result does not depend on the fallback URL's own path.
    var normalizedPath = startsWithSlash ? candidate : $"/{candidate}";
    return Uri.TryCreate(hostUri, normalizedPath, out var combinedUri)
        ? combinedUri.ToString()
        : null;
}
/// <summary>
/// Derives a title from a copyright line by keeping the text before the first opening parenthesis.
/// </summary>
/// <param name="copyrightText">Copyright line; may be <c>null</c> or blank.</param>
/// <returns>
/// The trimmed text before the first '(' when one exists past the first character; the whole
/// trimmed text otherwise; an empty string for blank input.
/// </returns>
private static string ExtractDomesticTitle(string? copyrightText)
{
    if (string.IsNullOrWhiteSpace(copyrightText))
    {
        return string.Empty;
    }

    var trimmed = copyrightText.Trim();
    var openParenAt = trimmed.IndexOf('(');

    // A parenthesis at position 0 (or none at all) leaves nothing to cut, so the text is kept whole.
    return openParenAt > 0
        ? trimmed.Substring(0, openParenAt).Trim()
        : trimmed;
}
/// <summary>
/// Reformats the leading <c>yyyyMMdd</c> portion of a date string as <c>yyyy-MM-dd</c>.
/// </summary>
/// <param name="rawDate">Date text whose first eight characters are expected to be <c>yyyyMMdd</c>.</param>
/// <returns>
/// The date as <c>yyyy-MM-dd</c>, or <c>null</c> when the input is blank, shorter than eight
/// characters, or its first eight characters are not a valid date.
/// </returns>
private static string? ParseDomesticDateText(string? rawDate)
{
    if (string.IsNullOrWhiteSpace(rawDate) || rawDate.Length < 8)
    {
        return null;
    }

    // Only the first eight characters are parsed; anything after them is ignored.
    var datePart = rawDate.Substring(0, 8);
    var parsed = DateTime.TryParseExact(
        datePart,
        "yyyyMMdd",
        CultureInfo.InvariantCulture,
        DateTimeStyles.None,
        out var day);

    return parsed
        ? day.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture)
        : null;
}
/// <summary>
/// Returns the first line of a text that still has content after trimming.
/// </summary>
/// <param name="text">Text split on carriage returns and line feeds; may be <c>null</c> or blank.</param>
/// <returns>The trimmed first non-blank line, or <c>null</c> when there is none.</returns>
private static string? ReadFirstNonEmptyLine(string? text)
{
    if (string.IsNullOrWhiteSpace(text))
    {
        return null;
    }

    var lines = text.Split(['\r', '\n'], StringSplitOptions.RemoveEmptyEntries);
    foreach (var line in lines)
    {
        var trimmedLine = line.Trim();
        if (!string.IsNullOrWhiteSpace(trimmedLine))
        {
            return trimmedLine;
        }
    }

    return null;
}
/// <summary>
/// Returns today's calendar date at a fixed UTC+08:00 offset.
/// </summary>
/// <returns>The date part of the current UTC time shifted by eight hours.</returns>
private static DateOnly GetChinaLocalDate()
    => DateOnly.FromDateTime(
        DateTimeOffset.UtcNow.ToOffset(TimeSpan.FromHours(8)).DateTime);
/// <summary>
/// Shortens a text to at most <paramref name="maxLength"/> characters, marking the cut with "...".
/// </summary>
/// <param name="text">Text to shorten; may be <c>null</c> or empty.</param>
/// <param name="maxLength">Number of leading characters to keep when the text is longer.</param>
/// <returns>
/// An empty string for null/empty input; the text unchanged when it fits; otherwise its first
/// <paramref name="maxLength"/> characters followed by "...".
/// </returns>
private static string Truncate(string? text, int maxLength)
{
    if (!string.IsNullOrEmpty(text))
    {
        if (text.Length > maxLength)
        {
            // The ellipsis is appended on top of maxLength, so the result can be three characters longer.
            return text.Substring(0, maxLength) + "...";
        }

        return text;
    }

    return string.Empty;
}
}