Skip to content

Commit 9ee1895

Browse files
committed
Merge pull request #14 from benbalter/gdoc-support
Gdoc support
2 parents db0c515 + ae4b60d commit 9ee1895

File tree

3 files changed

+243
-10
lines changed

3 files changed

+243
-10
lines changed

lib/word-to-markdown.rb

+24-10
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ class WordToMarkdown
1515
.MsoListParagraphCxSpMiddle
1616
.MsoListParagraphCxSpLast
1717
.MsoListParagraph
18+
li
1819
]
1920

2021
attr_reader :path, :doc
@@ -146,6 +147,20 @@ def li_selectors
146147
LI_SELECTORS.join(",")
147148
end
148149

150+
# Returns the sorted, de-duplicated indent values of every element matched by li_selectors (memoized in @indents)
151+
def indents
152+
@indents ||= doc.css(li_selectors).map{ |el| el.indent }.uniq.sort
153+
end
154+
155+
# Translate a raw indent value into its nesting level, i.e. its position within the sorted indents list
156+
#
157+
# level - the raw indent value to look up (despite the name, not a level), e.g., 2.5 (from 2.5em)
158+
#
159+
# Returns the zero-based Integer index of the value in indents, or nil if the value is not present
160+
def indent(level)
161+
indents.find_index level
162+
end
163+
149164
# Try to make semantic markup explicit where implied by the export
150165
def semanticize!
151166

@@ -160,25 +175,24 @@ def semanticize!
160175
list_type = "ul"
161176
end
162177

178+
# calculate indent level
179+
current_indent = indent(node.indent)
180+
163181
# Determine parent node for this li, creating it if necessary
164-
if node.indent > indent_level
182+
if current_indent > indent_level || indent_level == 0 && node.parent.css(".indent#{current_indent}").empty?
165183
list = Nokogiri::XML::Node.new list_type, @doc
166-
list.classes = ["list", "indent#{node.indent}"]
167-
if node.indent == 1
168-
list.parent = node.parent
169-
else
170-
list.parent = node.parent.css(".indent#{node.indent-1} li").last
171-
end
184+
list.classes = ["list", "indent#{current_indent}"]
185+
list.parent = node.parent.css(".indent#{current_indent-1} li").last || node.parent
172186
else
173-
list = node.parent.css(".indent#{node.indent}").last
187+
list = node.parent.css(".indent#{current_indent}").last
174188
end
175189

176190
# Note our current nesting depth
177-
indent_level = node.indent
191+
indent_level = current_indent
178192

179193
# Convert list paragraphs to actual numbered and unnumbered lists
180194
node.node_name = "li"
181-
node.parent = list
195+
node.parent = list if list
182196

183197
# Scrub unicode bullets
184198
span = node.css("span:first")[1]

test/fixtures/gdoc.htm

+215
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,215 @@
1+
<html>
2+
3+
<head>
4+
<meta content="text/html; charset=UTF-8" http-equiv="content-type">
5+
<style type="text/css">
6+
.lst-kix_id6kp4jljtnx-2>li: before {
7+
content: "\0025a0 "
8+
}
9+
.lst-kix_id6kp4jljtnx-4>li: before {
10+
content: "\0025cb "
11+
}
12+
.lst-kix_id6kp4jljtnx-3>li: before {
13+
content: "\0025cf "
14+
}
15+
ul.lst-kix_id6kp4jljtnx-0 {
16+
list-style-type: none
17+
}
18+
ul.lst-kix_id6kp4jljtnx-1 {
19+
list-style-type: none
20+
}
21+
.lst-kix_id6kp4jljtnx-7>li: before {
22+
content: "\0025cb "
23+
}
24+
ul.lst-kix_id6kp4jljtnx-2 {
25+
list-style-type: none
26+
}
27+
ul.lst-kix_id6kp4jljtnx-3 {
28+
list-style-type: none
29+
}
30+
.lst-kix_id6kp4jljtnx-6>li: before {
31+
content: "\0025cf "
32+
}
33+
ul.lst-kix_id6kp4jljtnx-4 {
34+
list-style-type: none
35+
}
36+
ul.lst-kix_id6kp4jljtnx-5 {
37+
list-style-type: none
38+
}
39+
ul.lst-kix_id6kp4jljtnx-6 {
40+
list-style-type: none
41+
}
42+
ul.lst-kix_id6kp4jljtnx-8 {
43+
list-style-type: none
44+
}
45+
ul.lst-kix_id6kp4jljtnx-7 {
46+
list-style-type: none
47+
}
48+
.lst-kix_id6kp4jljtnx-1>li: before {
49+
content: "\0025cb "
50+
}
51+
.lst-kix_id6kp4jljtnx-8>li: before {
52+
content: "\0025a0 "
53+
}
54+
.lst-kix_id6kp4jljtnx-0>li: before {
55+
content: "\0025cf "
56+
}
57+
.lst-kix_id6kp4jljtnx-5>li: before {
58+
content: "\0025a0 "
59+
}
60+
ol {
61+
margin: 0;
62+
padding: 0
63+
}
64+
.c1 {
65+
widows: 2;
66+
orphans: 2;
67+
direction: ltr
68+
}
69+
.c4 {
70+
max-width: 468pt;
71+
background-color: #ffffff;
72+
padding: 72pt 72pt 72pt 72pt
73+
}
74+
.c5 {
75+
padding-left: 0pt;
76+
margin-left: 72pt
77+
}
78+
.c0 {
79+
margin: 0;
80+
padding: 0
81+
}
82+
.c3 {
83+
padding-left: 0pt;
84+
margin-left: 36pt
85+
}
86+
.c2 {
87+
height: 11pt
88+
}
89+
.title {
90+
widows: 2;
91+
padding-top: 0pt;
92+
line-height: 1.15;
93+
orphans: 2;
94+
text-align: left;
95+
color: #000000;
96+
font-size: 21pt;
97+
font-family: "Trebuchet MS";
98+
padding-bottom: 0pt;
99+
page-break-after: avoid
100+
}
101+
.subtitle {
102+
widows: 2;
103+
padding-top: 0pt;
104+
line-height: 1.15;
105+
orphans: 2;
106+
text-align: left;
107+
color: #666666;
108+
font-style: italic;
109+
font-size: 13pt;
110+
font-family: "Trebuchet MS";
111+
padding-bottom: 10pt;
112+
page-break-after: avoid
113+
}
114+
li {
115+
color: #000000;
116+
font-size: 11pt;
117+
font-family: "Arial"
118+
}
119+
p {
120+
color: #000000;
121+
font-size: 11pt;
122+
margin: 0;
123+
font-family: "Arial"
124+
}
125+
h1 {
126+
widows: 2;
127+
padding-top: 10pt;
128+
line-height: 1.15;
129+
orphans: 2;
130+
text-align: left;
131+
color: #000000;
132+
font-size: 16pt;
133+
font-family: "Trebuchet MS";
134+
padding-bottom: 0pt;
135+
page-break-after: avoid
136+
}
137+
h2 {
138+
widows: 2;
139+
padding-top: 10pt;
140+
line-height: 1.15;
141+
orphans: 2;
142+
text-align: left;
143+
color: #000000;
144+
font-size: 13pt;
145+
font-family: "Trebuchet MS";
146+
font-weight: bold;
147+
padding-bottom: 0pt;
148+
page-break-after: avoid
149+
}
150+
h3 {
151+
widows: 2;
152+
padding-top: 8pt;
153+
line-height: 1.15;
154+
orphans: 2;
155+
text-align: left;
156+
color: #666666;
157+
font-size: 12pt;
158+
font-family: "Trebuchet MS";
159+
font-weight: bold;
160+
padding-bottom: 0pt;
161+
page-break-after: avoid
162+
}
163+
h4 {
164+
widows: 2;
165+
padding-top: 8pt;
166+
line-height: 1.15;
167+
orphans: 2;
168+
text-align: left;
169+
color: #666666;
170+
font-size: 11pt;
171+
text-decoration: underline;
172+
font-family: "Trebuchet MS";
173+
padding-bottom: 0pt;
174+
page-break-after: avoid
175+
}
176+
h5 {
177+
widows: 2;
178+
padding-top: 8pt;
179+
line-height: 1.15;
180+
orphans: 2;
181+
text-align: left;
182+
color: #666666;
183+
font-size: 11pt;
184+
font-family: "Trebuchet MS";
185+
padding-bottom: 0pt;
186+
page-break-after: avoid
187+
}
188+
h6 {
189+
widows: 2;
190+
padding-top: 8pt;
191+
line-height: 1.15;
192+
orphans: 2;
193+
text-align: left;
194+
color: #666666;
195+
font-style: italic;
196+
font-size: 11pt;
197+
font-family: "Trebuchet MS";
198+
padding-bottom: 0pt;
199+
page-break-after: avoid
200+
}
201+
</style>
202+
</head>
203+
204+
<body class="c4">
205+
<ul class="c0 lst-kix_id6kp4jljtnx-0 start">
206+
<li class="c1 c3"><span>Bullet point</span>
207+
</li>
208+
</ul>
209+
<ul class="c0 lst-kix_id6kp4jljtnx-1 start">
210+
<li class="c1 c5"><span>Indented bullet point</span>
211+
</li>
212+
</ul>
213+
</body>
214+
215+
</html>

test/test_word_to_markdown_lists.rb

+4
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,10 @@ class TestWordToMarkdownLists < Test::Unit::TestCase
2222
validate_fixture "nested-ul", "- One\n - Sub one\n - Sub sub one\n - Sub sub two\n\n - Sub two\n\n- Two"
2323
end
2424

25+
# The gdoc fixture holds two sibling <ul> elements whose items differ only in margin-left (36pt vs 72pt); the expected Markdown nests the second item under the first
should "parse gdoc nested uls" do
26+
validate_fixture "gdoc", "- Bullet point\n\n - Indented bullet point"
27+
end
28+
2529
should "parse left margin" do
2630
doc = WordToMarkdown.new "<p style='margin-left: 25px'>foo</p>"
2731
assert_equal 25, doc.doc.css("p").first.left_margin

0 commit comments

Comments
 (0)