Skip to content

Commit

Permalink
feat: extract language for code blocks correctly
Browse files Browse the repository at this point in the history
  • Loading branch information
ashishb committed Mar 24, 2024
1 parent 056c5f3 commit f16cbf3
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 2 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Written in Go.
1. [x] Custom font - defaults to Lexend
1. [x] Use draft date as a fallback date for draft posts
1. [x] Maintain the draft status for draft and pending posts
1. [ ] Migrate code blocks correctly - syntax highlighting is not working right now
1. [x] Migrate code blocks correctly - migrate existing code class information if available
1. [ ] Featured images - I tried this [WordPress plugin](https://wordpress.org/plugins/export-media-with-selected-content/) but featured images are simply not exported

## Why existing tools don't work
Expand Down
29 changes: 28 additions & 1 deletion src/wp2hugo/internal/hugogenerator/hugopage/hugo_page.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,13 @@ const _WordPressMoreTag = "<!--more-->"
const (
	// _customMoreTag is the Hugo shortcode form of a "more" marker.
	// NOTE(review): presumably substituted for the WordPress more tag; confirm at the use site.
	_customMoreTag = "{{< more >}}"
	// _wordPressTocTag is the literal "[toc]" marker text.
	// NOTE(review): presumably the WordPress table-of-contents placeholder; confirm at the use site.
	_wordPressTocTag = "[toc]"
)

var _markdownImageLinks = regexp.MustCompile(`!\[.*?]\((.+?)\)`)
var (
	// _markdownImageLinks matches Markdown image syntax `![alt](target)`;
	// capture group 1 is the link target.
	_markdownImageLinks = regexp.MustCompile(`!\[.*?]\((.+?)\)`)
	// _preTagExtractor1 matches EnlighterJS code blocks.
	// E.g. <pre class="EnlighterJSRAW" data-enlighter-language="golang">
	// Group 1 is the language, group 2 is the (possibly multi-line) body.
	_preTagExtractor1 = regexp.MustCompile(`<pre class="EnlighterJSRAW" data-enlighter-language="([^"]+?)".*?>([\s\S]*?)</pre>`)
	// _preTagExtractor2 matches <pre> blocks whose class attribute carries a
	// "lang:<name>" token.
	// E.g. <pre class="lang:bash" nums="false">
	// Group 1 is the language, group 2 is the (possibly multi-line) body.
	// The prefix before "lang:" is restricted to [^"] so the token must be
	// inside the class attribute value itself; otherwise the match could
	// start at an unrelated <pre class="..."> earlier on the same line and
	// swallow it, or pick up "lang:" from the element's text.
	_preTagExtractor2 = regexp.MustCompile(`<pre class="[^"]*?lang:([^" ]+).*?>([\s\S]*?)</pre>`)
)

// Extracts "src" from Hugo figure shortcode
// {{< figure align=aligncenter width=905 src="/wp-content/uploads/2023/01/Stollemeyer-castle-1024x768.jpg" alt="" >}}
Expand Down Expand Up @@ -127,6 +133,7 @@ func (page *Page) getMarkdown(provider ImageURLProvider, htmlContent string) (*s
return nil, fmt.Errorf("empty HTML content")
}
converter := getMarkdownConverter()
htmlContent = improvePreTagsWithCode(htmlContent)
htmlContent = replaceCaptionWithFigure(htmlContent)
htmlContent = replaceAWBWithParallaxBlur(provider, htmlContent)

Expand Down Expand Up @@ -164,6 +171,26 @@ func (page *Page) getMarkdown(provider ImageURLProvider, htmlContent string) (*s
return &markdown, nil
}

// improvePreTagsWithCode rewrites <pre> code blocks that carry language
// information into the <pre><code class="LANG">...</code></pre> shape.
//
// Two source formats are handled:
//   - EnlighterJS blocks (class="EnlighterJSRAW" with a
//     data-enlighter-language attribute), via _preTagExtractor1;
//   - blocks with a "lang:NAME" token in the class attribute, via
//     _preTagExtractor2.
//
// Ref: https://github.com/JohannesKaufmann/html-to-markdown/blob/master/commonmark.go#L334
// NOTE(review): presumably the converter reads the language from the <code>
// class as shown in the reference above; confirm against that library.
func improvePreTagsWithCode(htmlContent string) string {
	const codeBlockTemplate = `<pre><code class="$1">$2</code></pre>`

	if strings.Contains(htmlContent, "data-enlighter-language") {
		// Literal substitutions, applied strictly in this order: alias
		// normalization, prefix stripping, then removal of the attribute
		// for the "raw" and "generic" values.
		substitutions := [][2]string{
			{`data-enlighter-language="golang"`, `data-enlighter-language="go"`},
			{`data-enlighter-language="shell"`, `data-enlighter-language="bash"`},
			{`data-enlighter-language="sh"`, `data-enlighter-language="bash"`},
			{`data-enlighter-language="lang:`, `data-enlighter-language="`},
			{`data-enlighter-language="language-`, `data-enlighter-language="`},
			{`data-enlighter-language="raw"`, ""},
			{`data-enlighter-language="generic"`, ""},
		}
		for _, sub := range substitutions {
			htmlContent = strings.ReplaceAll(htmlContent, sub[0], sub[1])
		}
		htmlContent = _preTagExtractor1.ReplaceAllString(htmlContent, codeBlockTemplate)
		// Blocks whose language attribute was removed above did not match
		// the extractor; strip their now-meaningless class.
		htmlContent = strings.ReplaceAll(htmlContent, `class="EnlighterJSRAW"`, "")
	}

	if !strings.Contains(htmlContent, "pre class=") {
		return htmlContent
	}
	return _preTagExtractor2.ReplaceAllString(htmlContent, codeBlockTemplate)
}

// Mark code blocks with auto-detected language
// Note: https://github.com/alecthomas/chroma is fairly inaccurate in detecting languages
func highlightCode(markdown string) string {
Expand Down
20 changes: 20 additions & 0 deletions src/wp2hugo/internal/hugogenerator/hugopage/hugo_page_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package hugopage

import (
"github.com/stretchr/testify/assert"
"testing"
)

// TestPreTagExtractor2 checks that _preTagExtractor2 recognizes <pre> blocks
// with a "lang:NAME" class token and captures the language and the body.
func TestPreTagExtractor2(t *testing.T) {
	const example1 = `<pre class="lang:js decode:true">document.querySelector("video").playbackRate = 2.0; // For 2X speed-up</pre>`
	const example2 = `<pre class="theme:solarized-dark lang:sh decode:true">echo "whatever"</pre>`
	// Raw string literal: the trailing `\n` here is a backslash followed by
	// 'n', not a line break.
	const example3 = `<pre class="lang:sh decode:true"># Sample invocation:\n</pre>`
	// Interpreted string literal so the body contains real line breaks.
	const example4 = "<pre class=\"lang:sh decode:true\"># Sample invocation:\necho hi\n</pre>"

	for _, example := range []string{example1, example2, example3, example4} {
		assert.True(t, _preTagExtractor2.MatchString(example), example)
	}

	result3 := _preTagExtractor2.FindAllStringSubmatch(example3, -1)
	if assert.Len(t, result3, 1) && assert.Len(t, result3[0], 3) {
		assert.Equal(t, "sh", result3[0][1])
		assert.Equal(t, `# Sample invocation:\n`, result3[0][2])
	}

	// The body group must span multiple lines.
	result4 := _preTagExtractor2.FindAllStringSubmatch(example4, -1)
	if assert.Len(t, result4, 1) && assert.Len(t, result4[0], 3) {
		assert.Equal(t, "sh", result4[0][1])
		assert.Equal(t, "# Sample invocation:\necho hi\n", result4[0][2])
	}
}

0 comments on commit f16cbf3

Please sign in to comment.