Skip to content

Commit 57f6786

Browse files
yuyutaotao and zhoushaw authored
feat(extract-data): extract data from same-origin iframe (#258)
* feat: extract data from same-origin iframe
* fix: ci snapshot
* fix: extracting timeout error
* fix: ci timeout
* fix: update assets
* feat: set default size of yaml as 1920x1080
* chore: update default viewport size

---------

Co-authored-by: zhouxiao.shaw <[email protected]>
1 parent 37d8aad commit 57f6786

File tree

19 files changed

+277
-117
lines changed

19 files changed

+277
-117
lines changed

apps/site/docs/en/faq.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ There are some limitations with Midscene. We are still working on them.
1212

1313
1. The interaction types are limited to only tap, type, keyboard press, and scroll.
1414
2. LLMs are not 100% stable. Even GPT-4o can't return the right answer every time. Following the [Prompting Tips](./prompting-tips) will help improve stability.
15-
3. Since we use JavaScript to retrieve elements from the page, the elements inside the iframe cannot be accessed.
15+
3. Since we use JavaScript to retrieve elements from the page, elements inside cross-origin iframes cannot be accessed.
1616
4. We cannot access the native elements of Chrome, like the right-click context menu or file upload dialog.
1717
5. Do not use Midscene to bypass CAPTCHA. Some LLM services are set to decline requests that involve CAPTCHA-solving (e.g., OpenAI), while the DOM of some CAPTCHA pages is not accessible by regular web scraping methods. Therefore, using Midscene to bypass CAPTCHA is not a reliable method.
1818

apps/site/docs/zh/faq.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ Midscene 存在一些局限性,我们仍在努力改进。
1414

1515
1. 交互类型有限:目前仅支持点击、输入、键盘和滚动操作。
1616
2. 稳定性风险:即使是 GPT-4o 也无法确保 100% 返回正确答案。遵循 [编写提示词的技巧](./prompting-tips) 可以帮助提高 SDK 稳定性。
17-
3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问 iframe 内部的元素。
17+
3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问跨域 iframe 内部的元素。
1818
4. 无法访问 Chrome 原生元素:无法访问右键菜单、文件上传对话框等。
1919
5. 无法绕过验证码:有些 LLM 服务会拒绝涉及验证码解决的请求(例如 OpenAI),而有些验证码页面的 DOM 无法通过常规的网页抓取方法访问。因此,使用 Midscene 绕过验证码不是一个可靠的方法。
2020

packages/web-integration/package.json

+1
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,7 @@
145145
"devtools-protocol": "0.0.1380148",
146146
"dotenv": "16.4.5",
147147
"fs-extra": "11.2.0",
148+
"http-server": "14.1.1",
148149
"js-sha256": "0.11.0",
149150
"js-yaml": "4.1.0",
150151
"playwright": "1.44.1",

packages/web-integration/src/extractor/util.ts

+73-54
Original file line number | Diff line number | Diff line change
@@ -33,10 +33,11 @@ function selectorForValue(val: number | string): string {
3333
export function setDataForNode(
3434
node: HTMLElement | Node,
3535
nodeHash: string,
36-
setToParentNode = false,
36+
setToParentNode: boolean, // should be false for default
37+
currentWindow: typeof window,
3738
): string {
3839
const taskId = taskIdKey;
39-
if (!(node instanceof Element)) {
40+
if (!(node instanceof currentWindow.HTMLElement)) {
4041
return '';
4142
}
4243
if (!taskId) {
@@ -47,7 +48,7 @@ export function setDataForNode(
4748
const selector = selectorForValue(nodeHash);
4849
if (getDebugMode()) {
4950
if (setToParentNode) {
50-
if (node.parentNode instanceof HTMLElement) {
51+
if (node.parentNode instanceof currentWindow.HTMLElement) {
5152
node.parentNode.setAttribute(taskIdKey, nodeHash.toString());
5253
}
5354
} else {
@@ -57,17 +58,25 @@ export function setDataForNode(
5758
return selector;
5859
}
5960

60-
function isElementPartiallyInViewport(rect: ReturnType<typeof getRect>) {
61+
function isElementPartiallyInViewport(
62+
rect: ReturnType<typeof getRect>,
63+
currentWindow: typeof window,
64+
currentDocument: typeof document,
65+
) {
6166
const elementHeight = rect.height;
6267
const elementWidth = rect.width;
6368

6469
const viewportRect = {
6570
left: 0,
6671
top: 0,
67-
width: window.innerWidth || document.documentElement.clientWidth,
68-
height: window.innerHeight || document.documentElement.clientHeight,
69-
right: window.innerWidth || document.documentElement.clientWidth,
70-
bottom: window.innerHeight || document.documentElement.clientHeight,
72+
width:
73+
currentWindow.innerWidth || currentDocument.documentElement.clientWidth,
74+
height:
75+
currentWindow.innerHeight || currentDocument.documentElement.clientHeight,
76+
right:
77+
currentWindow.innerWidth || currentDocument.documentElement.clientWidth,
78+
bottom:
79+
currentWindow.innerHeight || currentDocument.documentElement.clientHeight,
7180
x: 0,
7281
y: 0,
7382
zoom: 1,
@@ -84,17 +93,20 @@ function isElementPartiallyInViewport(rect: ReturnType<typeof getRect>) {
8493
return visibleArea / totalArea >= 2 / 3;
8594
}
8695

87-
export function getPseudoElementContent(element: Node): {
96+
export function getPseudoElementContent(
97+
element: Node,
98+
currentWindow: typeof window,
99+
): {
88100
before: string;
89101
after: string;
90102
} {
91-
if (!(element instanceof HTMLElement)) {
103+
if (!(element instanceof currentWindow.HTMLElement)) {
92104
return { before: '', after: '' };
93105
}
94-
const beforeContent = window
106+
const beforeContent = currentWindow
95107
.getComputedStyle(element, '::before')
96108
.getPropertyValue('content');
97-
const afterContent = window
109+
const afterContent = currentWindow
98110
.getComputedStyle(element, '::after')
99111
.getPropertyValue('content');
100112
return {
@@ -103,8 +115,11 @@ export function getPseudoElementContent(element: Node): {
103115
};
104116
}
105117

106-
export function hasOverflowY(element: HTMLElement): boolean {
107-
const style = window.getComputedStyle(element);
118+
export function hasOverflowY(
119+
element: HTMLElement,
120+
currentWindow: typeof window,
121+
): boolean {
122+
const style = currentWindow.getComputedStyle(element);
108123
return (
109124
style.overflowY === 'scroll' ||
110125
style.overflowY === 'auto' ||
@@ -149,18 +164,22 @@ export function overlappedRect(
149164
return null;
150165
}
151166

152-
export function getRect(el: HTMLElement | Node, baseZoom = 1): ExtractedRect {
167+
export function getRect(
168+
el: HTMLElement | Node,
169+
baseZoom: number, // base zoom
170+
currentWindow: typeof window,
171+
): ExtractedRect {
153172
let originalRect: DOMRect;
154173
let newZoom = 1;
155-
if (!(el instanceof HTMLElement)) {
156-
const range = document.createRange();
174+
if (!(el instanceof currentWindow.HTMLElement)) {
175+
const range = currentWindow.document.createRange();
157176
range.selectNodeContents(el);
158177
originalRect = range.getBoundingClientRect();
159178
} else {
160179
originalRect = el.getBoundingClientRect();
161180
// from Chrome v128, the API would return differently https://docs.google.com/document/d/1AcnDShjT-kEuRaMchZPm5uaIgNZ4OiYtM4JI9qiV8Po/edit
162181
if (!('currentCSSZoom' in el)) {
163-
newZoom = Number.parseFloat(window.getComputedStyle(el).zoom) || 1;
182+
newZoom = Number.parseFloat(currentWindow.getComputedStyle(el).zoom) || 1;
164183
}
165184
}
166185

@@ -179,13 +198,17 @@ export function getRect(el: HTMLElement | Node, baseZoom = 1): ExtractedRect {
179198
};
180199
}
181200

182-
const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {
201+
const isElementCovered = (
202+
el: HTMLElement | Node,
203+
rect: ExtractedRect,
204+
currentWindow: typeof window,
205+
) => {
183206
// Gets the center coordinates of the element
184207
const x = rect.left + rect.width / 2;
185208
const y = rect.top + rect.height / 2;
186209

187210
// Gets the element above that point
188-
const topElement = document.elementFromPoint(x, y);
211+
const topElement = currentWindow.document.elementFromPoint(x, y);
189212
if (!topElement) {
190213
return false; // usually because it's outside the screen
191214
}
@@ -201,7 +224,7 @@ const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {
201224
return false;
202225
}
203226

204-
const rectOfTopElement = getRect(topElement as HTMLElement, 1);
227+
const rectOfTopElement = getRect(topElement as HTMLElement, 1, currentWindow);
205228

206229
// get the remaining area of the base element
207230
const overlapRect = overlappedRect(rect, rectOfTopElement);
@@ -232,6 +255,8 @@ const isElementCovered = (el: HTMLElement | Node, rect: ExtractedRect) => {
232255

233256
export function visibleRect(
234257
el: HTMLElement | Node | null,
258+
currentWindow: typeof window,
259+
currentDocument: typeof document,
235260
baseZoom = 1,
236261
):
237262
| { left: number; top: number; width: number; height: number; zoom: number }
@@ -242,16 +267,16 @@ export function visibleRect(
242267
}
243268

244269
if (
245-
!(el instanceof HTMLElement) &&
270+
!(el instanceof currentWindow.HTMLElement) &&
246271
el.nodeType !== Node.TEXT_NODE &&
247272
el.nodeName.toLowerCase() !== 'svg'
248273
) {
249274
logger(el, 'Element is not in the DOM hierarchy');
250275
return false;
251276
}
252277

253-
if (el instanceof HTMLElement) {
254-
const style = window.getComputedStyle(el);
278+
if (el instanceof currentWindow.HTMLElement) {
279+
const style = currentWindow.getComputedStyle(el);
255280
if (
256281
style.display === 'none' ||
257282
style.visibility === 'hidden' ||
@@ -262,7 +287,7 @@ export function visibleRect(
262287
}
263288
}
264289

265-
const rect = getRect(el, baseZoom);
290+
const rect = getRect(el, baseZoom, currentWindow);
266291

267292
if (rect.width === 0 && rect.height === 0) {
268293
logger(el, 'Element has no size');
@@ -271,18 +296,24 @@ export function visibleRect(
271296

272297
// check if the element is covered by another element
273298
// if the element is zoomed, the coverage check should be done with the original zoom
274-
if (baseZoom === 1 && isElementCovered(el, rect)) {
299+
if (baseZoom === 1 && isElementCovered(el, rect, currentWindow)) {
275300
return false;
276301
}
277302

278-
const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
279-
const scrollTop = window.pageYOffset || document.documentElement.scrollTop;
303+
const scrollLeft =
304+
currentWindow.pageXOffset || currentDocument.documentElement.scrollLeft;
305+
const scrollTop =
306+
currentWindow.pageYOffset || currentDocument.documentElement.scrollTop;
280307
const viewportWidth =
281-
window.innerWidth || document.documentElement.clientWidth;
308+
currentWindow.innerWidth || currentDocument.documentElement.clientWidth;
282309
const viewportHeight =
283-
window.innerHeight || document.documentElement.clientHeight;
310+
currentWindow.innerHeight || currentDocument.documentElement.clientHeight;
284311

285-
const isPartiallyInViewport = isElementPartiallyInViewport(rect);
312+
const isPartiallyInViewport = isElementPartiallyInViewport(
313+
rect,
314+
currentWindow,
315+
currentDocument,
316+
);
286317

287318
if (!isPartiallyInViewport) {
288319
logger(el, 'Element is completely outside the viewport', {
@@ -297,14 +328,14 @@ export function visibleRect(
297328

298329
// check if the element is hidden by an ancestor
299330
let parent: HTMLElement | Node | null = el;
300-
while (parent && parent !== document.body) {
301-
if (!(parent instanceof HTMLElement)) {
331+
while (parent && parent !== currentDocument.body) {
332+
if (!(parent instanceof currentWindow.HTMLElement)) {
302333
parent = parent.parentElement;
303334
continue;
304335
}
305-
const parentStyle = window.getComputedStyle(parent);
336+
const parentStyle = currentWindow.getComputedStyle(parent);
306337
if (parentStyle.overflow === 'hidden') {
307-
const parentRect = getRect(parent, 1);
338+
const parentRect = getRect(parent, 1, currentWindow);
308339
const tolerance = 10;
309340

310341
if (
@@ -348,23 +379,6 @@ export function validTextNodeContent(node: Node): string | false {
348379
return false;
349380
}
350381

351-
// const everyChildNodeIsText = Array.from(node.childNodes).every((child) => {
352-
// const tagName = ((child as HTMLElement).tagName || '').toLowerCase();
353-
// if (
354-
// tagName === 'script' ||
355-
// tagName === 'style' ||
356-
// tagName === 'link' ||
357-
// tagName !== '#text'
358-
// ) {
359-
// return false;
360-
// }
361-
// return true;
362-
// });
363-
364-
// if (!everyChildNodeIsText) {
365-
// return false;
366-
// }
367-
368382
const content = node.textContent || (node as HTMLElement).innerText;
369383
if (content && !/^\s*$/.test(content)) {
370384
return content.trim();
@@ -375,8 +389,13 @@ export function validTextNodeContent(node: Node): string | false {
375389

376390
export function getNodeAttributes(
377391
node: HTMLElement | Node,
392+
currentWindow: typeof window,
378393
): Record<string, string> {
379-
if (!node || !(node instanceof HTMLElement) || !node.attributes) {
394+
if (
395+
!node ||
396+
!(node instanceof currentWindow.HTMLElement) ||
397+
!node.attributes
398+
) {
380399
return {};
381400
}
382401

@@ -464,7 +483,7 @@ export function setExtractTextWithPositionOnWindow() {
464483
}
465484
}
466485

467-
export function getDocument(): HTMLElement {
486+
export function getTopDocument(): HTMLElement {
468487
const container: HTMLElement = document.body || document;
469488
return container;
470489
}

0 commit comments

Comments
 (0)