Use Patchright and a temporary profile to fix issues; add debug output

This commit is contained in:
Mantao Huang 2026-03-12 10:58:14 -04:00
parent dd5c159108
commit 11cf99e63a

View File

@@ -1,259 +1,504 @@
import argparse
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
import io
import json
import os
import tempfile
import urllib.request
import zipfile
import io
import sys
import argparse
from datetime import datetime
EXTENSION_ID = 'ekhagklcjbdpajgpjgmbionohlpdbjgc'
from patchright.async_api import async_playwright
# Extension id of the Zotero Connector; interpolated into the download URL below.
EXTENSION_ID = "ekhagklcjbdpajgpjgmbionohlpdbjgc"
# CRX download endpoint; the extension id is URL-encoded into the `x` parameter.
EXTENSION_URL = (
    "https://clients2.google.com/service/update2/crx"
    "?response=redirect&os=mac&arch=x86-64&os_arch=x86-64&nacl_arch=x86-64"
    "&prod=chromecrx&prodchannel=&prodversion=114.0.5735.90&lang=en-US"
    f"&acceptformat=crx3&x=id%3D{EXTENSION_ID}%26installsource%3Dondemand%26uc"
)
# Directory containing this script; the unpacked extension lives next to it.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
EXTENSION_DIR = os.path.join(BASE_DIR, "zotero_extension")
# Viewport handed to the persistent browser context.
WINDOW_SIZE = {"width": 1280, "height": 800}
# Pause after the page load event before looking for the service worker.
TRANSLATOR_WAIT_SECONDS = 3
# Pause before closing any extra tabs that are open after launch.
WELCOME_TAB_WAIT_SECONDS = 2
# Pause after the save call returns, before the browser is closed.
POST_SAVE_WAIT_SECONDS = 5
# Service-worker discovery: number of polls and the delay between them (seconds).
SERVICE_WORKER_POLL_ATTEMPTS = 60
SERVICE_WORKER_POLL_INTERVAL = 0.5
# Zero-based poll indexes at which the known workers are logged. The last entry
# is derived from the poll count so the final attempt is always logged even if
# SERVICE_WORKER_POLL_ATTEMPTS is changed.
SERVICE_WORKER_DEBUG_ATTEMPTS = {0, 5, 10, 20, 40, SERVICE_WORKER_POLL_ATTEMPTS - 1}
# JavaScript source evaluated inside the Zotero Connector service worker via
# worker.evaluate(). It receives {libraryName, collectionId} and returns a plain
# object carrying either `error` or `ok`/`mode`/`result`/`target`, always
# together with a `debug` list of {label, value} entries.
SAVE_SCRIPT = r"""
async ({ libraryName, collectionId }) => {
  // Diagnostic entries handed back to the caller with every result.
  const debug = [];
  const pushDebug = (label, value = null) => debug.push({ label, value });

  // Accepts "13", "c13" or "C13" and yields "C13"; any other value is only trimmed.
  function normalizeCollectionId(value) {
    if (!value) return null;
    const trimmed = value.trim();
    if (/^[cC]\d+$/.test(trimmed)) return `C${trimmed.slice(1)}`;
    if (/^\d+$/.test(trimmed)) return `C${trimmed}`;
    return trimmed;
  }

  // Reduce target objects to the fields worth logging.
  function summarizeTargets(targets) {
    return targets.map((target) => ({
      id: target.id,
      name: target.name,
      level: target.level
    }));
  }

  // Reduce translator objects to the fields worth logging.
  function summarizeTranslators(translators = []) {
    return translators.map((translator) => ({
      translatorID: translator.translatorID,
      label: translator.label,
      itemType: translator.itemType,
      priority: translator.priority
    }));
  }

  // Walk backwards through the flat target list, climbing to ever shallower
  // levels, until an entry whose id starts with "L" is reached.
  function findOwningLibrary(targets, targetId) {
    const index = targets.findIndex((target) => target.id === targetId);
    if (index === -1) return null;
    let level = targets[index].level || 0;
    for (let i = index - 1; i >= 0; i -= 1) {
      const candidate = targets[i];
      const candidateLevel = candidate.level || 0;
      if (candidateLevel >= level) continue;
      if (candidate.id.startsWith("L")) return candidate;
      level = candidateLevel;
    }
    return targets[index].id.startsWith("L") ? targets[index] : null;
  }

  // Turn the requested library name and collection id into one target object,
  // or null when neither was requested. Throws when a request cannot be matched.
  async function resolveTarget() {
    if (!libraryName && !collectionId) {
      pushDebug("resolved target", null);
      return null;
    }
    const response = await Zotero.Connector.callMethod("getSelectedCollection", {
      switchToReadableLibrary: true
    });
    const targets = response.targets || [];
    pushDebug("available targets", summarizeTargets(targets));
    if (!targets.length) {
      throw new Error("Zotero did not return any selectable targets.");
    }

    let libraryTarget = null;
    if (libraryName) {
      const normalizedLibraryName = libraryName.trim().toLowerCase();
      // A target without a name is treated as an empty name instead of
      // crashing the whole lookup with a TypeError.
      const matches = targets.filter(
        (target) =>
          target.id.startsWith("L") &&
          String(target.name ?? "").trim().toLowerCase() === normalizedLibraryName
      );
      if (!matches.length) {
        throw new Error(`Library '${libraryName}' was not found.`);
      }
      if (matches.length > 1) {
        throw new Error(`Library '${libraryName}' is ambiguous.`);
      }
      libraryTarget = matches[0];
    }

    let collectionTarget = null;
    if (collectionId) {
      const normalizedCollectionId = normalizeCollectionId(collectionId);
      collectionTarget = targets.find((target) => target.id === normalizedCollectionId);
      if (!collectionTarget) {
        throw new Error(`Collection '${collectionId}' was not found.`);
      }
      if (libraryTarget) {
        const owningLibrary = findOwningLibrary(targets, collectionTarget.id);
        if (!owningLibrary || owningLibrary.id !== libraryTarget.id) {
          throw new Error(
            `Collection '${collectionId}' does not belong to library '${libraryName}'.`
          );
        }
      }
    }

    // A collection is more specific than a library, so it wins when both are given.
    const target = collectionTarget || libraryTarget;
    pushDebug("resolved target", target ? { id: target.id, name: target.name } : null);
    return target;
  }

  // Reduce the connector tab info to the fields worth logging.
  function summarizeTabInfo(tabInfo) {
    if (!tabInfo) return null;
    return {
      url: tabInfo.url,
      isPDF: Boolean(tabInfo.isPDF),
      frameId: tabInfo.frameId,
      translatorCount: tabInfo.translators?.length || 0,
      translators: summarizeTranslators(tabInfo.translators)
    };
  }

  // Wrap the connector save entry points so that, once a save request has been
  // sent, the session is pointed at the requested target. Returns a function
  // that removes the wrappers again.
  function installSessionHooks(target) {
    const originalCallMethodWithCookies =
      Zotero.Connector.callMethodWithCookies.bind(Zotero.Connector);
    const originalSaveStandaloneAttachment =
      Zotero.ItemSaver?.saveStandaloneAttachmentToZotero?.bind(Zotero.ItemSaver);

    async function applyTargetToSession(sessionID) {
      if (!target || !sessionID) return;
      pushDebug("apply target to session", { sessionID, targetId: target.id });
      await Zotero.Connector.callMethod("updateSession", {
        sessionID,
        target: target.id
      });
    }

    Zotero.Connector.callMethodWithCookies = async function(method, payload, ...args) {
      pushDebug("callMethodWithCookies request", {
        method,
        hasPayload: Boolean(payload),
        sessionID: payload?.sessionID || null
      });
      const result = await originalCallMethodWithCookies(method, payload, ...args);
      pushDebug("callMethodWithCookies response", { method, result });
      if ((method === "saveItems" || method === "saveSnapshot") && payload?.sessionID) {
        await applyTargetToSession(payload.sessionID);
      }
      return result;
    };

    if (originalSaveStandaloneAttachment) {
      Zotero.ItemSaver.saveStandaloneAttachmentToZotero = async function(
        attachment,
        sessionID,
        ...args
      ) {
        pushDebug("saveStandaloneAttachmentToZotero request", {
          title: attachment?.title || null,
          url: attachment?.url || null,
          sessionID
        });
        const result = await originalSaveStandaloneAttachment(
          attachment,
          sessionID,
          ...args
        );
        pushDebug("saveStandaloneAttachmentToZotero response", result);
        await applyTargetToSession(sessionID);
        return result;
      };
    }

    return () => {
      Zotero.Connector.callMethodWithCookies = originalCallMethodWithCookies;
      if (originalSaveStandaloneAttachment) {
        Zotero.ItemSaver.saveStandaloneAttachmentToZotero =
          originalSaveStandaloneAttachment;
      }
    };
  }

  // Save the active tab: through the first detected translator when there is
  // one, otherwise as a webpage snapshot.
  async function runSave() {
    pushDebug("connector online", await Zotero.Connector.checkIsOnline());
    const tabs = await chrome.tabs.query({ active: true, currentWindow: true });
    if (!tabs?.length) {
      return { error: "No active tab found.", debug };
    }
    const tab = tabs[0];
    pushDebug("active tab", {
      id: tab.id,
      url: tab.url,
      title: tab.title,
      status: tab.status
    });
    const tabInfo = Zotero.Connector_Browser.getTabInfo(tab.id);
    pushDebug("tab info", summarizeTabInfo(tabInfo));
    if (!tabInfo) {
      return { error: "No translator or webpage saving options available.", debug };
    }
    if (tabInfo.translators?.length) {
      const result = await Zotero.Connector_Browser.saveWithTranslator(tab, 0, {
        fallbackOnFailure: true
      });
      pushDebug("saveWithTranslator result", result);
      return { ok: true, mode: "translator", result, debug };
    }
    const result = await Zotero.Connector_Browser.saveAsWebpage(tab, tabInfo.frameId, {
      snapshot: true
    });
    pushDebug("saveAsWebpage result", result);
    return { ok: true, mode: "webpage", result, debug };
  }

  try {
    const target = await resolveTarget();
    const restoreHooks = installSessionHooks(target);
    try {
      const result = await runSave();
      return { ...result, target };
    } finally {
      // Always undo the wrappers, whether the save succeeded or threw.
      restoreHooks();
    }
  } catch (error) {
    // Any value can be thrown, not only Error instances; fall back to its
    // string form so the caller never receives an undefined message.
    const message = error?.message ?? String(error);
    pushDebug("caught error", {
      message,
      stack: error?.stack ?? null
    });
    return { error: message, debug };
  }
}
"""
def debug_log(label, value=None):
    """Print a timestamped debug line to stdout.

    Args:
        label: Short description of what is being logged.
        value: Optional payload. ``None`` prints the label alone. Dicts, lists
            and tuples are rendered as indented JSON; anything else is
            formatted with its normal string conversion.
    """
    timestamp = datetime.now().strftime("%H:%M:%S")
    if value is None:
        print(f"[debug {timestamp}] {label}")
        return
    if isinstance(value, (dict, list, tuple)):
        try:
            value = json.dumps(value, ensure_ascii=True, default=str, indent=2)
        except (TypeError, ValueError):
            # TypeError: dict keys JSON cannot encode (default= does not apply
            # to keys). ValueError: circular references. A debug helper must
            # never abort the run, so fall back to repr().
            value = repr(value)
    print(f"[debug {timestamp}] {label}: {value}")
# Fetch extension directly from the alternative Chrome Webstore endpoint
EXTENSION_URL = f"https://clients2.google.com/service/update2/crx?response=redirect&os=mac&arch=x86-64&os_arch=x86-64&nacl_arch=x86-64&prod=chromecrx&prodchannel=&prodversion=114.0.5735.90&lang=en-US&acceptformat=crx3&x=id%3D{EXTENSION_ID}%26installsource%3Dondemand%26uc"
EXTENSION_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'zotero_extension')
USER_DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'chrome_profile')
print(EXTENSION_DIR)
def setup_extension():
"""Downloads and unpacks the Zotero Connector Chrome extension if not already present."""
if os.path.exists(EXTENSION_DIR) and os.path.exists(os.path.join(EXTENSION_DIR, 'manifest.json')):
"""Download and unpack the Zotero Connector extension if needed."""
manifest_path = os.path.join(EXTENSION_DIR, "manifest.json")
if os.path.exists(manifest_path):
print("[*] Zotero Extension already unpacked locally.")
return os.path.abspath(EXTENSION_DIR)
print("[*] Downloading Zotero Connector")
req = urllib.request.Request(
request = urllib.request.Request(
EXTENSION_URL,
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"},
)
with urllib.request.urlopen(req) as response:
with urllib.request.urlopen(request) as response:
data = response.read()
print("[*] Unpacking CRX file...")
# .crx files are zip files with an extra header. Find the standard ZIP header (PK\x03\x04).
zip_start = data.find(b'PK\x03\x04')
zip_start = data.find(b"PK\x03\x04")
if zip_start == -1:
raise ValueError("Could not find ZIP header in downloaded CRX.")
os.makedirs(EXTENSION_DIR, exist_ok=True)
with zipfile.ZipFile(io.BytesIO(data[zip_start:])) as z:
z.extractall(EXTENSION_DIR)
with zipfile.ZipFile(io.BytesIO(data[zip_start:])) as archive:
archive.extractall(EXTENSION_DIR)
print("[*] Zotero Extension setup complete.")
return os.path.abspath(EXTENSION_DIR)
async def save_to_zotero(url, headless_mode="new", library_name=None, collection_id=None):
"""Automates Chrome to load a URL and trigger Zotero Connector."""
extension_path = setup_extension()
print(f"[*] Launching Chromium browser (headless={headless_mode}) with Zotero Connector...")
# Prepare playwright arguments
def get_browser_launch_config(extension_path, headless_mode):
args = [
f"--disable-extensions-except={extension_path}",
f"--load-extension={extension_path}",
]
# Standard headless=True historically blocked extensions.
# We use headless=False by default or can pass `--headless=new` parameter.
if headless_mode == "new":
args.append("--headless=new")
playwright_headless = False
elif headless_mode == "false":
playwright_headless = False
else:
playwright_headless = True # May ignore extensions depending on Chromium version
playwright_headless = True
async with Stealth().use_async(async_playwright()) as p:
browser_context = await p.chromium.launch_persistent_context(
USER_DATA_DIR,
headless=playwright_headless,
args=args,
viewport={'width': 1280, 'height': 800}
)
return playwright_headless, args
# Close any welcome tabs the extension might open on first run
await asyncio.sleep(2)
if len(browser_context.pages) > 1:
for p_ext in browser_context.pages[1:]:
await p_ext.close()
# Use the primary tab
page = browser_context.pages[0] if browser_context.pages else await browser_context.new_page()
async def close_extra_pages(browser_context):
await asyncio.sleep(WELCOME_TAB_WAIT_SECONDS)
extra_pages = browser_context.pages[1:]
if not extra_pages:
return
print(f"[*] Navigating to {url}...")
await page.goto(url, wait_until="load")
debug_log("closing extra tabs", [page.url for page in extra_pages])
for extra_page in extra_pages:
await extra_page.close()
print("[*] Page loaded. Waiting for Zotero translator to initialize...")
# Give Zotero connection translator a moment to inject and detect the metadata
await asyncio.sleep(3)
print("[*] Finding Zotero Connector service worker...")
worker = None
for i in range(60):
for w in browser_context.service_workers:
if "background-worker.js" in w.url or "zotero" in w.url:
worker = w
break
if worker:
break
await asyncio.sleep(0.5)
if not worker:
print("[!] Could not find Zotero extension service worker.")
else:
assert worker is not None
print("[*] Triggering save via extension service worker evaluation...")
save_result = await worker.evaluate('''async ({ libraryName, collectionId }) => {
function normalizeCollectionId(value) {
if (!value) return null;
const trimmed = value.trim();
if (/^[cC]\\d+$/.test(trimmed)) return `C${trimmed.slice(1)}`;
if (/^\\d+$/.test(trimmed)) return `C${trimmed}`;
return trimmed;
}
async def get_primary_page(browser_context):
    """Return the page to navigate with, after extra tabs have been closed.

    Reuses the first page of the context when one exists and opens a new page
    otherwise. The chosen page's URL and the current page count are logged.
    """
    await close_extra_pages(browser_context)

    open_pages = browser_context.pages
    if open_pages:
        primary = open_pages[0]
    else:
        primary = await browser_context.new_page()

    snapshot = {"url": primary.url, "page_count": len(browser_context.pages)}
    debug_log("active page before navigation", snapshot)
    return primary
function findOwningLibrary(targets, targetId) {
let index = targets.findIndex((target) => target.id === targetId);
if (index === -1) return null;
let currentLevel = targets[index].level || 0;
for (let i = index - 1; i >= 0; i--) {
let candidate = targets[i];
let candidateLevel = candidate.level || 0;
if (candidateLevel < currentLevel) {
if (candidate.id.startsWith("L")) {
return candidate;
}
currentLevel = candidateLevel;
}
}
return targets[index].id.startsWith("L") ? targets[index] : null;
}
async def wait_for_service_worker(browser_context):
print("[*] Finding Zotero Connector service worker...")
for attempt in range(SERVICE_WORKER_POLL_ATTEMPTS):
workers = list(browser_context.service_workers)
if attempt in SERVICE_WORKER_DEBUG_ATTEMPTS:
debug_log(
"service worker poll",
{
"attempt": attempt + 1,
"known_workers": [worker.url for worker in workers],
},
)
async function resolveTarget() {
if (!libraryName && !collectionId) return null;
for worker in workers:
if "background-worker.js" in worker.url or "zotero" in worker.url:
debug_log("selected service worker", worker.url)
return worker
let response = await Zotero.Connector.callMethod("getSelectedCollection", { switchToReadableLibrary: true });
let targets = response.targets || [];
if (!targets.length) {
throw new Error("Zotero did not return any selectable targets.");
}
await asyncio.sleep(SERVICE_WORKER_POLL_INTERVAL)
let libraryTarget = null;
if (libraryName) {
let normalizedLibraryName = libraryName.trim().toLowerCase();
let matches = targets.filter((target) =>
target.id.startsWith("L") && target.name.trim().toLowerCase() === normalizedLibraryName
);
if (!matches.length) {
throw new Error(`Library '${libraryName}' was not found.`);
}
if (matches.length > 1) {
throw new Error(`Library '${libraryName}' is ambiguous.`);
}
libraryTarget = matches[0];
}
return None
let collectionTarget = null;
if (collectionId) {
let normalizedCollectionId = normalizeCollectionId(collectionId);
collectionTarget = targets.find((target) => target.id === normalizedCollectionId);
if (!collectionTarget) {
throw new Error(`Collection '${collectionId}' was not found.`);
}
if (libraryTarget) {
let owningLibrary = findOwningLibrary(targets, collectionTarget.id);
if (!owningLibrary || owningLibrary.id !== libraryTarget.id) {
throw new Error(
`Collection '${collectionId}' does not belong to library '${libraryName}'.`
);
}
}
}
return collectionTarget || libraryTarget;
}
async def navigate_to_page(page, url, browser_context):
print(f"[*] Navigating to {url}...")
response = await page.goto(url, wait_until="load")
debug_log(
"navigation result",
{
"response_url": response.url if response else None,
"status": response.status if response else None,
"final_page_url": page.url,
"title": await page.title(),
},
)
let target = await resolveTarget();
let applyTargetToSession = async (sessionID) => {
if (!target || !sessionID) return;
await Zotero.Connector.callMethod("updateSession", { sessionID, target: target.id });
};
print("[*] Page loaded. Waiting for Zotero translator to initialize...")
await asyncio.sleep(TRANSLATOR_WAIT_SECONDS)
debug_log(
"post-load page snapshot",
{
"url": page.url,
"title": await page.title(),
"service_workers": [worker.url for worker in browser_context.service_workers],
},
)
let originalCallMethodWithCookies = Zotero.Connector.callMethodWithCookies.bind(Zotero.Connector);
let originalSaveStandaloneAttachment = Zotero.ItemSaver?.saveStandaloneAttachmentToZotero?.bind(Zotero.ItemSaver);
Zotero.Connector.callMethodWithCookies = async function(method, payload, ...args) {
let result = await originalCallMethodWithCookies(method, payload, ...args);
if ((method === "saveItems" || method === "saveSnapshot") && payload?.sessionID) {
await applyTargetToSession(payload.sessionID);
}
return result;
};
def log_save_result(save_result):
if not save_result or "error" in save_result:
error = save_result.get("error") if save_result else "Unknown error"
print(f"[!] Save trigger failed: {error}")
else:
print(f"[*] Save completed successfully via {save_result.get('mode', 'unknown')}.")
target = save_result.get("target")
if target:
print(f"[*] Save target: {target.get('name')} ({target.get('id')})")
if save_result.get("result") is not None:
print(f"[*] Save returned: {save_result['result']}")
if (originalSaveStandaloneAttachment) {
Zotero.ItemSaver.saveStandaloneAttachmentToZotero = async function(attachment, sessionID, ...args) {
let result = await originalSaveStandaloneAttachment(attachment, sessionID, ...args);
await applyTargetToSession(sessionID);
return result;
};
}
debug_log("save_result", save_result)
for index, entry in enumerate(save_result.get("debug", []), start=1):
debug_log(f"worker debug #{index} {entry.get('label')}", entry.get("value"))
try {
let tabs = await chrome.tabs.query({ active: true, currentWindow: true });
if (!tabs || tabs.length === 0) return {error: "No active tab found."};
let tab = tabs[0];
let tabInfo = Zotero.Connector_Browser.getTabInfo(tab.id);
if (tabInfo && tabInfo.translators && tabInfo.translators.length) {
let result = await Zotero.Connector_Browser.saveWithTranslator(tab, 0, {fallbackOnFailure: true});
return { ok: true, mode: "translator", result, target };
} else if (tabInfo) {
let result = await Zotero.Connector_Browser.saveAsWebpage(tab, tabInfo.frameId, { snapshot: true });
return { ok: true, mode: "webpage", result, target };
} else {
return {error: "No translator or webpage saving options available."};
}
} catch(e) {
return {error: e.message};
} finally {
Zotero.Connector.callMethodWithCookies = originalCallMethodWithCookies;
if (originalSaveStandaloneAttachment) {
Zotero.ItemSaver.saveStandaloneAttachmentToZotero = originalSaveStandaloneAttachment;
}
}
}''', {"libraryName": library_name, "collectionId": collection_id})
async def save_to_zotero(url, headless_mode="new", library_name=None, collection_id=None):
extension_path = setup_extension()
playwright_headless, browser_args = get_browser_launch_config(extension_path, headless_mode)
if not save_result or "error" in save_result:
print(f"[!] Save trigger failed: {save_result.get('error') if save_result else 'Unknown error'}")
else:
save_mode = save_result.get("mode", "unknown")
returned = save_result.get("result")
target = save_result.get("target")
print(f"[*] Save completed successfully via {save_mode}.")
if target:
print(f"[*] Save target: {target.get('name')} ({target.get('id')})")
if returned is not None:
print(f"[*] Save returned: {returned}")
print(f"[*] Launching Chromium browser (headless={headless_mode}) with Zotero Connector...")
debug_log(
"launch configuration",
{"playwright_headless": playwright_headless, "args": browser_args},
)
print("[*] Operation finished. Closing browser.")
await browser_context.close()
async with async_playwright() as playwright:
with tempfile.TemporaryDirectory(prefix="zotero-paper-fetcher-") as user_data_dir:
debug_log(
"save_to_zotero arguments",
{
"url": url,
"headless_mode": headless_mode,
"library_name": library_name,
"collection_id": collection_id,
"extension_path": extension_path,
"user_data_dir": user_data_dir,
},
)
browser_context = await playwright.chromium.launch_persistent_context(
user_data_dir,
headless=playwright_headless,
args=browser_args,
viewport=WINDOW_SIZE,
)
if __name__ == "__main__":
try:
debug_log(
"temporary context launched",
{
"initial_page_count": len(browser_context.pages),
"service_worker_count": len(browser_context.service_workers),
"user_data_dir": user_data_dir,
},
)
page = await get_primary_page(browser_context)
await navigate_to_page(page, url, browser_context)
worker = await wait_for_service_worker(browser_context)
if not worker:
print("[!] Could not find Zotero extension service worker.")
return
print("[*] Triggering save via extension service worker evaluation...")
save_result = await worker.evaluate(
SAVE_SCRIPT,
{"libraryName": library_name, "collectionId": collection_id},
)
log_save_result(save_result)
print("[*] Waiting 5 seconds for any delayed connector activity...")
await asyncio.sleep(POST_SAVE_WAIT_SECONDS)
finally:
print("[*] Operation finished. Closing browser.")
await browser_context.close()
def parse_args():
parser = argparse.ArgumentParser(description="Automate Zotero Connector via Playwright.")
parser.add_argument("url", nargs="?", default="https://arxiv.org/abs/1706.03762", help="URL to save to Zotero")
parser.add_argument("--headed", action="store_true", help="Show browser UI visually instead of headless=new")
parser.add_argument(
"url",
nargs="?",
default="https://arxiv.org/abs/1706.03762",
help="URL to save to Zotero",
)
parser.add_argument(
"--headed",
action="store_true",
help="Show browser UI visually instead of headless=new",
)
parser.add_argument("--library-name", help="Save into the library with this exact name")
parser.add_argument("--collection-id", help="Save into the collection with this ID, such as 13 or C13")
args = parser.parse_args()
parser.add_argument(
"--collection-id",
help="Save into the collection with this ID, such as 13 or C13",
)
return parser.parse_args()
headless_arg = "false" if args.headed else "new"
def main():
print(EXTENSION_DIR)
args = parse_args()
headless_mode = "false" if args.headed else "new"
asyncio.run(
save_to_zotero(
args.url,
headless_mode=headless_arg,
headless_mode=headless_mode,
library_name=args.library_name,
collection_id=args.collection_id,
)
)
if __name__ == "__main__":
main()