docs+skills: add main UI/UX visual-truth PRD and skill links

This commit is contained in:
ZenchantLive 2026-02-18 12:50:53 -08:00
parent 1c36223e7f
commit 14a50ad4ae
289 changed files with 54463 additions and 0 deletions

View file

@ -0,0 +1,297 @@
# Authentication Patterns
Login flows, OAuth, 2FA, and authenticated browsing.
**Related**: [session-management.md](session-management.md) for session details, [SKILL.md](../SKILL.md) for quick start.
## Contents
- [Basic Login Flow](#basic-login-flow)
- [OAuth / SSO Flows](#oauth--sso-flows)
- [Two-Factor Authentication](#two-factor-authentication)
- [Session Reuse Patterns](#session-reuse-patterns)
- [Cookie Extraction](#cookie-extraction)
- [Security Best Practices](#security-best-practices)
## Basic Login Flow
Standard username/password login:
```bash
#!/bin/bash
# Start session
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://app.example.com/login"
}' | jq -r '.session_id')
# Get form elements
# Expected: @e1 [input type="email"], @e2 [input type="password"], @e3 [button] "Sign In"
# Fill credentials
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "fill", "ref": "@e1", "text": "user@example.com"
}'
# Build the JSON with jq so that quotes or backslashes in $PASSWORD are escaped
# instead of breaking (or injecting into) a hand-written JSON string.
infsh app run agent-browser --function interact --session "$SESSION" --input "$(
  jq -cn --arg text "$PASSWORD" '{action: "fill", ref: "@e2", text: $text}'
)"
# Submit
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e3"
}'
# Wait for redirect
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "wait", "wait_ms": 2000
}'
# Verify login succeeded
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
URL=$(echo $RESULT | jq -r '.url')
if [[ "$URL" == *"/login"* ]]; then
echo "Login failed - still on login page"
exit 1
fi
echo "Login successful"
# Continue with authenticated actions...
```
## OAuth / SSO Flows
For OAuth redirects (Google, GitHub, etc.):
```bash
#!/bin/bash
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://app.example.com/auth/google"
}' | jq -r '.session_id')
# Wait for redirect to Google
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "wait", "wait_ms": 3000
}'
# Snapshot to see Google login form
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
echo $RESULT | jq '.elements_text'
# Fill Google email
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "fill", "ref": "@e1", "text": "user@gmail.com"
}'
# Click Next
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e2"
}'
# Wait and snapshot for password field
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "wait", "wait_ms": 2000
}'
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
# Fill password
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "fill", "ref": "@e1", "text": "'"$GOOGLE_PASSWORD"'"
}'
# Click Sign in
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e2"
}'
# Wait for redirect back to app
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "wait", "wait_ms": 5000
}'
# Verify we're back on the app
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
URL=$(echo $RESULT | jq -r '.url')
echo "Final URL: $URL"
```
## Two-Factor Authentication
For 2FA, you may need human intervention or TOTP generation:
### With TOTP Code
```bash
# After password, check for 2FA prompt
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
ELEMENTS=$(echo $RESULT | jq -r '.elements_text')
if echo "$ELEMENTS" | grep -qi "verification\|2fa\|authenticator"; then
# Generate TOTP code (requires oathtool)
TOTP_CODE=$(oathtool --totp -b "$TOTP_SECRET")
# Fill 2FA code
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "fill", "ref": "@e1", "text": "'"$TOTP_CODE"'"
}'
# Submit
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e2"
}'
fi
```
### With Manual Intervention
For SMS or hardware token 2FA:
```bash
# Record video so user can see the 2FA prompt
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://app.example.com/login",
"record_video": true
}' | jq -r '.session_id')
# ... login flow ...
# At 2FA step, prompt user
echo "2FA code sent. Enter the code:"
read -r CODE
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "fill", "ref": "@e1", "text": "'"$CODE"'"
}'
```
## Session Reuse Patterns
Since sessions maintain cookies, you can reuse authenticated sessions:
```bash
#!/bin/bash
# login-and-work.sh
# Login once
# Open a new browser session on the login page and print its session ID.
# Outputs: the session ID on stdout, so callers can write SESSION=$(login).
login() {
  # Keep the ID local (as do_work does) so calling login directly never
  # overwrites a SESSION variable owned by the caller.
  local session_id
  session_id=$(infsh app run agent-browser --function open --session new --input '{
    "url": "https://app.example.com/login"
  }' | jq -r '.session_id')
  # ... login steps ...
  # (redirect their stdout to /dev/null or stderr: anything they print on
  #  stdout would be captured by SESSION=$(login) together with the ID)
  printf '%s\n' "$session_id"
}
# Do work with authenticated session
# Run the authenticated part of the workflow in an existing session.
# Arguments: $1 - session ID obtained from an earlier open call
# Outputs:   whatever the goto and snapshot calls print, in that order
# Returns:   1 if no session ID is given; otherwise the status of the
#            snapshot call
do_work() {
  local session_id=${1-}
  # Without an ID the command line would collapse to "--session --input ...".
  if [[ -z "$session_id" ]]; then
    echo "do_work: missing session ID" >&2
    return 1
  fi
  # Navigate to protected page
  infsh app run agent-browser --function interact --session "$session_id" --input '{
    "action": "goto", "url": "https://app.example.com/dashboard"
  }'
  # Extract data
  infsh app run agent-browser --function snapshot --session "$session_id" --input '{}'
}
# Main
SESSION=$(login)
do_work $SESSION
# Don't close if you want to reuse!
# infsh app run agent-browser --function close --session $SESSION --input '{}'
```
## Cookie Extraction
Extract cookies for use in other tools:
```bash
# Get cookies via JavaScript
RESULT=$(infsh app run agent-browser --function execute --session $SESSION --input '{
"code": "document.cookie"
}')
COOKIES=$(echo $RESULT | jq -r '.result')
echo "Cookies: $COOKIES"
# NOTE: this call does not return cookies. It lists the names (URLs) of the
# resources the page has loaded, via the Performance API.
# httpOnly cookies are never exposed to page JavaScript, so neither this call
# nor document.cookie above can read them.
RESULT=$(infsh app run agent-browser --function execute --session $SESSION --input '{
"code": "JSON.stringify(performance.getEntriesByType(\"resource\").map(r => r.name))"
}')
```
## Security Best Practices
### 1. Never Hardcode Credentials
```bash
# Good: Use environment variables
'{"action": "fill", "ref": "@e2", "text": "'"$PASSWORD"'"}'
# Bad: Hardcoded
'{"action": "fill", "ref": "@e2", "text": "mypassword123"}'
```
### 2. Use Secure Environment Variables
```bash
# Set securely
export PASSWORD=$(cat /path/to/secure/password)
# Or use a secrets manager
export PASSWORD=$(vault read -field=password secret/app)
```
### 3. Don't Log Sensitive Data
```bash
# Good: Redact sensitive info
echo "Logging in as $USERNAME"
# Bad: Logging passwords
echo "Password: $PASSWORD" # Never do this!
```
### 4. Close Sessions After Use
```bash
# Always clean up
trap 'infsh app run agent-browser --function close --session $SESSION --input "{}" 2>/dev/null' EXIT
```
### 5. Use Video Recording for Debugging Only
Video may capture sensitive information:
```bash
# Only enable when debugging
if [ "$DEBUG" = "true" ]; then
RECORD_VIDEO="true"
else
RECORD_VIDEO="false"
fi
```
### 6. Verify Login Success
Always confirm authentication worked:
```bash
# Check URL changed from login page
URL=$(echo $RESULT | jq -r '.url')
if [[ "$URL" == *"/login"* ]] || [[ "$URL" == *"/signin"* ]]; then
echo "ERROR: Login failed"
exit 1
fi
# Or check for specific element on authenticated page
ELEMENTS=$(echo $RESULT | jq -r '.elements_text')
if ! echo "$ELEMENTS" | grep -q "Logout\|Dashboard\|Welcome"; then
echo "ERROR: Not authenticated"
exit 1
fi
```

View file

@ -0,0 +1,272 @@
# Command Reference
Complete reference for all agent-browser functions. For quick start, see [SKILL.md](../SKILL.md).
## Base Command
All commands follow this pattern:
```bash
infsh app run agent-browser --function <function> --session <session_id|new> --input '<json>'
```
- `--function`: Function to call (open, snapshot, interact, screenshot, execute, close)
- `--session`: Session ID from previous call, or `new` to start fresh
- `--input`: JSON input for the function
## Functions
### open
Navigate to URL and configure browser. This is the entry point for all sessions.
```bash
infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com",
"width": 1280,
"height": 720,
"user_agent": "Mozilla/5.0...",
"record_video": false,
"show_cursor": false,
"proxy_url": null,
"proxy_username": null,
"proxy_password": null
}'
```
**Input Fields:**
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `url` | string | required | URL to navigate to |
| `width` | int | 1280 | Viewport width in pixels |
| `height` | int | 720 | Viewport height in pixels |
| `user_agent` | string | null | Custom user agent string |
| `record_video` | bool | false | Record video (returned on close) |
| `show_cursor` | bool | false | Show cursor indicator in screenshots/video |
| `proxy_url` | string | null | Proxy server URL |
| `proxy_username` | string | null | Proxy auth username |
| `proxy_password` | string | null | Proxy auth password |
**Output:**
```json
{
"session_id": "abc123",
"url": "https://example.com",
"title": "Example Domain",
"elements": [...],
"elements_text": "@e1 [a] \"More information...\" href=\"...\"\n...",
"screenshot": "<File>"
}
```
### snapshot
Re-fetch page state with `@e` refs. Call after navigation or DOM changes.
```bash
infsh app run agent-browser --function snapshot --session $SESSION_ID --input '{}'
```
**Output:** Same as `open` (url, title, elements, elements_text, screenshot)
### interact
Perform actions on the page using `@e` refs.
```bash
infsh app run agent-browser --function interact --session $SESSION_ID --input '{
"action": "click",
"ref": "@e1"
}'
```
**Input Fields:**
| Field | Type | Description |
|-------|------|-------------|
| `action` | string | Action to perform (see Actions table) |
| `ref` | string | Element ref (e.g., `@e1`) |
| `text` | string | Text for fill/type/press/select |
| `direction` | string | Scroll direction: up, down, left, right |
| `scroll_amount` | int | Scroll pixels (default 400) |
| `wait_ms` | int | Wait duration in milliseconds |
| `url` | string | URL for goto action |
| `target_ref` | string | Target ref for drag action |
| `file_paths` | array | File paths for upload action |
**Actions:**
| Action | Required Fields | Description |
|--------|-----------------|-------------|
| `click` | `ref` | Single click |
| `dblclick` | `ref` | Double click |
| `fill` | `ref`, `text` | Clear input and type text |
| `type` | `text` | Type text without clearing |
| `press` | `text` | Press key (Enter, Tab, Escape, etc.) |
| `select` | `ref`, `text` | Select dropdown option by label |
| `hover` | `ref` | Hover over element |
| `check` | `ref` | Check checkbox |
| `uncheck` | `ref` | Uncheck checkbox |
| `drag` | `ref`, `target_ref` | Drag from ref to target_ref |
| `upload` | `ref`, `file_paths` | Upload files to file input |
| `scroll` | `direction` | Scroll page (optional: `scroll_amount`) |
| `back` | - | Go back in browser history |
| `wait` | `wait_ms` | Wait for specified milliseconds |
| `goto` | `url` | Navigate to different URL |
**Output:**
```json
{
"success": true,
"action": "click",
"message": null,
"screenshot": "<File>",
"snapshot": {
"url": "...",
"title": "...",
"elements": [...],
"elements_text": "..."
}
}
```
### screenshot
Take a screenshot of the current page.
```bash
infsh app run agent-browser --function screenshot --session $SESSION_ID --input '{
"full_page": true
}'
```
**Input Fields:**
| Field | Type | Default | Description |
|-------|------|---------|-------------|
| `full_page` | bool | false | Capture full scrollable page |
**Output:**
```json
{
"screenshot": "<File>",
"width": 1280,
"height": 720
}
```
### execute
Run JavaScript code on the page.
```bash
infsh app run agent-browser --function execute --session $SESSION_ID --input '{
"code": "document.title"
}'
```
**Input Fields:**
| Field | Type | Description |
|-------|------|-------------|
| `code` | string | JavaScript code to execute |
**Output:**
```json
{
"result": "Example Domain",
"error": null,
"screenshot": "<File>"
}
```
**Examples:**
```bash
# Get page title
'{"code": "document.title"}'
# Count elements
'{"code": "document.querySelectorAll(\"a\").length"}'
# Extract text
'{"code": "document.querySelector(\"h1\").textContent"}'
# Get all links
'{"code": "Array.from(document.querySelectorAll(\"a\")).map(a => a.href)"}'
# Scroll to bottom
'{"code": "window.scrollTo(0, document.body.scrollHeight)"}'
# Get computed style
'{"code": "getComputedStyle(document.body).backgroundColor"}'
```
### close
Close the browser session. Returns video if recording was enabled.
```bash
infsh app run agent-browser --function close --session $SESSION_ID --input '{}'
```
**Output:**
```json
{
"success": true,
"video": "<File or null>"
}
```
## Key Combinations
For the `press` action, use these key names:
| Key | Name |
|-----|------|
| Enter | `Enter` |
| Tab | `Tab` |
| Escape | `Escape` |
| Backspace | `Backspace` |
| Delete | `Delete` |
| Arrow keys | `ArrowUp`, `ArrowDown`, `ArrowLeft`, `ArrowRight` |
| Modifiers | `Control`, `Shift`, `Alt`, `Meta` |
**Key combinations:**
```bash
# Ctrl+A (select all)
'{"action": "press", "text": "Control+a"}'
# Ctrl+C (copy)
'{"action": "press", "text": "Control+c"}'
# Shift+Tab (focus previous)
'{"action": "press", "text": "Shift+Tab"}'
```
## Error Handling
When an action fails, `success` is `false` and `message` contains the error:
```json
{
"success": false,
"action": "click",
"message": "Unknown ref: @e99. Run 'snapshot' to get current elements.",
"screenshot": "<File>",
"snapshot": {...}
}
```
Common errors:
- `Unknown ref: @eN` - Ref doesn't exist, re-snapshot needed
- `'text' required for fill action` - Missing required field
- `'target_ref' required for drag action` - Missing drag target
- `Timeout 5000ms exceeded` - Element not found or not clickable

View file

@ -0,0 +1,295 @@
# Proxy Support
Proxy configuration for geo-testing, privacy, and corporate environments.
**Related**: [commands.md](commands.md) for full function reference, [SKILL.md](../SKILL.md) for quick start.
## Contents
- [Basic Proxy Configuration](#basic-proxy-configuration)
- [Authenticated Proxy](#authenticated-proxy)
- [Common Use Cases](#common-use-cases)
- [Proxy Types](#proxy-types)
- [Verifying Proxy Connection](#verifying-proxy-connection)
- [Troubleshooting](#troubleshooting)
- [Best Practices](#best-practices)
## Basic Proxy Configuration
Set proxy when opening a session:
```bash
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com",
"proxy_url": "http://proxy.example.com:8080"
}' | jq -r '.session_id')
```
All traffic for this session routes through the proxy.
## Authenticated Proxy
For proxies requiring username/password:
```bash
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com",
"proxy_url": "http://proxy.example.com:8080",
"proxy_username": "myuser",
"proxy_password": "mypassword"
}' | jq -r '.session_id')
```
## Common Use Cases
### Geo-Location Testing
Test how your site appears from different regions:
```bash
#!/bin/bash
# Test from multiple regions
PROXIES=(
"us|http://us-proxy.example.com:8080"
"eu|http://eu-proxy.example.com:8080"
"asia|http://asia-proxy.example.com:8080"
)
for entry in "${PROXIES[@]}"; do
REGION="${entry%%|*}"
PROXY="${entry##*|}"
echo "Testing from: $REGION"
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://mysite.com",
"proxy_url": "'"$PROXY"'"
}' | jq -r '.session_id')
# Take screenshot
infsh app run agent-browser --function screenshot --session $SESSION --input '{
"full_page": true
}' > "${REGION}-screenshot.json"
# Get page content
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION --input '{}')
echo $RESULT | jq '.elements_text' > "${REGION}-elements.txt"
infsh app run agent-browser --function close --session $SESSION --input '{}'
done
echo "Geo-testing complete"
```
### Rate Limit Avoidance
Rotate proxies for web scraping:
```bash
#!/bin/bash
# Rotate through proxy list
PROXIES=(
"http://proxy1.example.com:8080"
"http://proxy2.example.com:8080"
"http://proxy3.example.com:8080"
)
URLS=(
"https://site.com/page1"
"https://site.com/page2"
"https://site.com/page3"
)
for i in "${!URLS[@]}"; do
# Rotate proxy
PROXY_INDEX=$((i % ${#PROXIES[@]}))
PROXY="${PROXIES[$PROXY_INDEX]}"
URL="${URLS[$i]}"
echo "Fetching $URL via proxy $((PROXY_INDEX + 1))"
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "'"$URL"'",
"proxy_url": "'"$PROXY"'"
}' | jq -r '.session_id')
# Extract data
RESULT=$(infsh app run agent-browser --function execute --session $SESSION --input '{
"code": "document.body.innerText"
}')
echo $RESULT | jq -r '.result' > "page-$i.txt"
infsh app run agent-browser --function close --session $SESSION --input '{}'
# Polite delay
sleep 1
done
```
### Corporate Network Access
Access sites through corporate proxy:
```bash
# Use corporate proxy for external sites
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://external-vendor.com",
"proxy_url": "http://corpproxy.company.com:8080",
"proxy_username": "'"$CORP_USER"'",
"proxy_password": "'"$CORP_PASS"'"
}' | jq -r '.session_id')
```
### Privacy and Anonymity
Route through privacy-focused proxy:
```bash
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://whatismyip.com",
"proxy_url": "socks5://privacy-proxy.example.com:1080"
}' | jq -r '.session_id')
```
## Proxy Types
### HTTP/HTTPS Proxy
```json
{"proxy_url": "http://proxy.example.com:8080"}
{"proxy_url": "https://proxy.example.com:8080"}
```
### SOCKS5 Proxy
```json
{"proxy_url": "socks5://proxy.example.com:1080"}
```
### With Authentication
```json
{
"proxy_url": "http://proxy.example.com:8080",
"proxy_username": "user",
"proxy_password": "pass"
}
```
## Verifying Proxy Connection
Check that traffic routes through proxy:
```bash
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://httpbin.org/ip",
"proxy_url": "http://proxy.example.com:8080"
}' | jq -r '.session_id')
# Get the IP shown
RESULT=$(infsh app run agent-browser --function execute --session $SESSION --input '{
"code": "document.body.innerText"
}')
echo "IP via proxy: $(echo $RESULT | jq -r '.result')"
infsh app run agent-browser --function close --session $SESSION --input '{}'
```
The IP should be the proxy's IP, not your real IP.
## Troubleshooting
### Connection Failed
```
Error: Failed to open URL: net::ERR_PROXY_CONNECTION_FAILED
```
**Solutions:**
1. Verify proxy URL is correct
2. Check proxy is running and accessible
3. Confirm port is correct
4. Test proxy with curl: `curl -x http://proxy:8080 https://example.com`
### Authentication Failed
```
Error: 407 Proxy Authentication Required
```
**Solutions:**
1. Verify username/password are correct
2. Check if proxy requires different auth method
3. If the credentials contain characters that are special in JSON (such as `"` or `\`), make sure they are escaped; otherwise the `--input` JSON is malformed
### SSL Errors
Some proxies perform SSL inspection. If you see certificate errors:
```bash
# The browser should handle most SSL proxies automatically
# If issues persist, verify proxy SSL certificate is valid
```
### Slow Performance
**Solutions:**
1. Choose proxy closer to target site
2. Use faster proxy provider
3. Reduce number of requests per session
## Best Practices
### 1. Use Environment Variables
```bash
# Good: Credentials in env vars
'{"proxy_url": "'"$PROXY_URL"'", "proxy_username": "'"$PROXY_USER"'"}'
# Bad: Hardcoded
'{"proxy_url": "http://user:pass@proxy.com:8080"}'
```
### 2. Test Proxy Before Automation
```bash
# Verify proxy works
curl -x "$PROXY_URL" https://httpbin.org/ip
```
### 3. Handle Proxy Failures
```bash
# Retry with different proxy on failure
for PROXY in "${PROXIES[@]}"; do
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "'"$URL"'",
"proxy_url": "'"$PROXY"'"
}' 2>&1)
if echo "$SESSION" | jq -e '.session_id' > /dev/null 2>&1; then
SESSION_ID=$(echo $SESSION | jq -r '.session_id')
break
fi
echo "Proxy $PROXY failed, trying next..."
done
```
### 4. Respect Rate Limits
Even with proxies, be a good citizen:
```bash
# Add delays between requests
'{"action": "wait", "wait_ms": 1000}'
```
### 5. Log Proxy Usage
For debugging, log which proxy was used:
```bash
echo "$(date): Using proxy $PROXY for $URL" >> proxy.log
```

View file

@ -0,0 +1,204 @@
# Session Management
Browser sessions for state persistence and parallel browsing.
**Related**: [authentication.md](authentication.md) for login patterns, [SKILL.md](../SKILL.md) for quick start.
## Contents
- [How Sessions Work](#how-sessions-work)
- [Starting a Session](#starting-a-session)
- [Using Session IDs](#using-session-ids)
- [Session State](#session-state)
- [Parallel Sessions](#parallel-sessions)
- [Session Cleanup](#session-cleanup)
- [Best Practices](#best-practices)
## How Sessions Work
Each session maintains an isolated browser context with:
- Cookies
- LocalStorage / SessionStorage
- Browser history
- Page state
- Video recording (if enabled)
Sessions persist across function calls, allowing multi-step workflows.
## Starting a Session
Use `--session new` to create a fresh session:
```bash
RESULT=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com"
}')
SESSION_ID=$(echo $RESULT | jq -r '.session_id')
echo "Session: $SESSION_ID"
```
## Using Session IDs
All subsequent calls use the session ID:
```bash
# Navigate
infsh app run agent-browser --function open --session $SESSION_ID --input '{
"url": "https://example.com/page2"
}'
# Interact
infsh app run agent-browser --function interact --session $SESSION_ID --input '{
"action": "click", "ref": "@e1"
}'
# Screenshot
infsh app run agent-browser --function screenshot --session $SESSION_ID --input '{}'
# Close
infsh app run agent-browser --function close --session $SESSION_ID --input '{}'
```
## Session State
### What Persists
Within a session, these persist across calls:
- Cookies (login state, preferences)
- LocalStorage and SessionStorage
- IndexedDB data
- Browser history (for back/forward)
- Current page and DOM state
- Video recording buffer
### What Doesn't Persist
- Sessions don't persist across server restarts
- No automatic session recovery
- Video recordings: the video is returned only by the `close` call and is not available afterwards
## Parallel Sessions
Run multiple independent sessions simultaneously:
```bash
#!/bin/bash
# Scrape multiple sites in parallel
# Start sessions
RESULT1=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://site1.com"
}')
SESSION1=$(echo $RESULT1 | jq -r '.session_id')
RESULT2=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://site2.com"
}')
SESSION2=$(echo $RESULT2 | jq -r '.session_id')
# Work with each session independently
infsh app run agent-browser --function screenshot --session $SESSION1 --input '{}' &
infsh app run agent-browser --function screenshot --session $SESSION2 --input '{}' &
wait
# Clean up both
infsh app run agent-browser --function close --session $SESSION1 --input '{}'
infsh app run agent-browser --function close --session $SESSION2 --input '{}'
```
### Use Cases for Parallel Sessions
1. **A/B Testing** - Compare different pages or user experiences
2. **Multi-site scraping** - Gather data from multiple sources
3. **Load testing** - Simulate multiple users
4. **Cross-region testing** - Use different proxies per session
## Session Cleanup
Always close sessions when done:
```bash
infsh app run agent-browser --function close --session $SESSION_ID --input '{}'
```
**Why close matters:**
- Releases server resources
- Returns video recording (if enabled)
- Prevents resource leaks
### Error Handling
```bash
#!/bin/bash
set -e
# EXIT handler: close the browser session, if one was opened.
# Globals: SESSION_ID (read) - can still be unset or empty when the trap
#          fires, because the trap is installed before the open call runs.
cleanup() {
  # Nothing to close if open never produced an ID. jq -r prints the literal
  # string "null" when the response has no session_id field.
  if [[ -z "${SESSION_ID:-}" || "$SESSION_ID" == "null" ]]; then
    return 0
  fi
  # Best-effort: errors from close are deliberately ignored.
  infsh app run agent-browser --function close --session "$SESSION_ID" --input '{}' 2>/dev/null || true
}
trap cleanup EXIT
SESSION_ID=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com"
}' | jq -r '.session_id')
# ... your automation ...
# cleanup runs automatically on exit
```
## Best Practices
### 1. Store Session IDs
```bash
# Good: Store for reuse
SESSION_ID=$(... | jq -r '.session_id')
infsh ... --session $SESSION_ID ...
# Bad: Parse every time
infsh ... --session $(... | jq -r '.session_id') ...
```
### 2. Close Sessions Promptly
Don't leave sessions open longer than needed. Server resources are limited.
### 3. Use Meaningful Variable Names
```bash
# Good: Clear purpose
LOGIN_SESSION=$(...)
SCRAPE_SESSION=$(...)
# Bad: Generic names
S1=$(...)
S2=$(...)
```
### 4. Handle Session Expiry
Sessions may expire after extended inactivity:
```bash
# Check if session is still valid
RESULT=$(infsh app run agent-browser --function snapshot --session $SESSION_ID --input '{}' 2>&1)
if echo "$RESULT" | grep -q "session not found"; then
echo "Session expired, starting new one"
SESSION_ID=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com"
}' | jq -r '.session_id')
fi
```
### 5. One Task Per Session
For clarity, use one session per logical task:
```bash
# Good: Separate sessions for separate tasks
LOGIN_SESSION=$(...) # Handle login
SCRAPE_SESSION=$(...) # Handle scraping
# Okay for related tasks: One session for a workflow
SESSION=$(...)
# login -> navigate -> extract -> close
```

View file

@ -0,0 +1,251 @@
# Snapshot and Refs
Compact element references that reduce context usage for AI agents.
**Related**: [commands.md](commands.md) for full function reference, [SKILL.md](../SKILL.md) for quick start.
## Contents
- [How Refs Work](#how-refs-work)
- [Snapshot Output Format](#snapshot-output-format)
- [Using Refs](#using-refs)
- [Ref Lifecycle](#ref-lifecycle)
- [Best Practices](#best-practices)
- [Ref Notation Details](#ref-notation-details)
- [Troubleshooting](#troubleshooting)
## How Refs Work
Traditional approach:
```
Full DOM/HTML -> AI parses -> CSS selector -> Action (~3000-5000 tokens)
```
agent-browser approach:
```
Compact snapshot -> @refs assigned -> Direct interaction (~200-400 tokens)
```
The snapshot extracts interactive elements and assigns short `@e` refs, reducing token usage significantly.
## Snapshot Output Format
```bash
infsh app run agent-browser --function snapshot --session $SESSION --input '{}'
```
**Response `elements_text`:**
```
@e1 [a] "Home" href="/"
@e2 [a] "Products" href="/products"
@e3 [a] "About" href="/about"
@e4 [button] "Sign In"
@e5 [input type="email"] placeholder="Email"
@e6 [input type="password"] placeholder="Password"
@e7 [button type="submit"] "Log In"
@e8 [input type="checkbox"] name="remember"
```
**Response `elements` (structured):**
```json
[
{
"ref": "@e1",
"desc": "@e1 [a] \"Home\" href=\"/\"",
"tag": "a",
"text": "Home",
"role": null,
"name": null,
"href": "/",
"input_type": null
},
...
]
```
## Using Refs
Once you have refs, interact directly:
```bash
# Click the "Sign In" button
'{"action": "click", "ref": "@e4"}'
# Fill email input
'{"action": "fill", "ref": "@e5", "text": "user@example.com"}'
# Fill password
'{"action": "fill", "ref": "@e6", "text": "password123"}'
# Submit the form
'{"action": "click", "ref": "@e7"}'
# Check the "remember me" checkbox
'{"action": "check", "ref": "@e8"}'
```
## Ref Lifecycle
**IMPORTANT**: Refs are invalidated when the page changes!
```bash
# Get initial snapshot
infsh app run agent-browser --function snapshot --session $SESSION --input '{}'
# @e1 [button] "Next"
# Click triggers page change
infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e1"
}'
# MUST re-snapshot to get new refs!
infsh app run agent-browser --function snapshot --session $SESSION --input '{}'
# @e1 [h1] "Page 2" <- Different element now!
```
### When to Re-snapshot
Always re-snapshot after:
1. **Navigation** - Clicking links, form submissions, `goto` action
2. **Dynamic content** - AJAX loads, modals opening, tabs switching
3. **Page mutations** - JavaScript modifying the DOM
The `interact` function returns a fresh snapshot in its response, so you can often use that instead of a separate snapshot call.
## Best Practices
### 1. Always Use the Latest Snapshot
```bash
# CORRECT: Use snapshot from previous response
RESULT=$(infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e1"
}')
# Use elements from $RESULT.snapshot for next action
# WRONG: Using stale refs
# After navigation, @e1 may point to a completely different element
```
### 2. Check Success Before Continuing
```bash
RESULT=$(infsh app run agent-browser --function interact --session $SESSION --input '{
"action": "click", "ref": "@e5"
}')
SUCCESS=$(echo $RESULT | jq -r '.success')
if [ "$SUCCESS" != "true" ]; then
echo "Click failed: $(echo $RESULT | jq -r '.message')"
# Re-snapshot and retry
fi
```
### 3. Use elements_text for Quick Decisions
For AI agents, `elements_text` provides a compact text representation:
```
@e1 [input type="email"] placeholder="Email"
@e2 [input type="password"] placeholder="Password"
@e3 [button] "Submit"
```
This is often enough to decide which element to interact with without parsing the full `elements` array.
## Ref Notation Details
```
@e1 [tag type="value"] "text content" name="attr"
| | | | |
| | | | +- Additional attributes
| | | +- Visible text
| | +- Key attributes shown
| +- HTML tag name
+- Unique ref ID
```
### Common Patterns
```
@e1 [button] "Submit" # Button with text
@e2 [input type="email"] # Email input
@e3 [input type="password"] # Password input
@e4 [a] "Link Text" href="/page" # Anchor link
@e5 [select] # Dropdown
@e6 [textarea] placeholder="Message" # Text area
@e7 [input type="file"] # File upload
@e8 [input type="checkbox"] checked # Checked checkbox
@e9 [input type="radio"] selected # Selected radio
@e10 [button type="submit"] "Send" # Submit button
```
### Elements Captured
The snapshot captures these interactive elements:
- Links (`<a href>`)
- Buttons (`<button>`, `[role="button"]`)
- Inputs (`<input>`, `<textarea>`, `<select>`)
- Clickable elements (`[onclick]`, `[tabindex]`)
- ARIA roles (`[role="link"]`, `[role="checkbox"]`, etc.)
Non-interactive or hidden elements are filtered out.
## Troubleshooting
### "Unknown ref" Error
```json
{
"success": false,
"message": "Unknown ref: @e15. Run 'snapshot' to get current elements."
}
```
**Solution**: Re-snapshot. The page changed and refs are stale.
```bash
infsh app run agent-browser --function snapshot --session $SESSION --input '{}'
# Now use the new refs
```
### Element Not in Snapshot
The element you need might not appear because:
1. **Not visible** - Scroll to reveal it
```bash
'{"action": "scroll", "direction": "down", "scroll_amount": 500}'
```
2. **Not interactive** - Use JavaScript to interact
```bash
'{"code": "document.querySelector(\".hidden-btn\").click()"}'
```
3. **In iframe** - Currently not supported (use `execute` with JS)
4. **Dynamic** - Wait for it to load
```bash
'{"action": "wait", "wait_ms": 2000}'
```
### Too Many Elements
Snapshots are limited to 50 elements. If the page has more:
1. **Scroll** to bring relevant elements into view
2. **Use JavaScript** to target specific elements
3. **Navigate** to a more specific page
### Ref Points to Wrong Element
If a ref seems to interact with the wrong element:
1. Re-snapshot to get fresh refs
2. Check if the page structure changed
3. Verify with screenshot that the right element is targeted

View file

@ -0,0 +1,286 @@
# Video Recording
Capture browser automation as video for debugging, documentation, or verification.
**Related**: [commands.md](commands.md) for full function reference, [SKILL.md](../SKILL.md) for quick start.
## Contents
- [Basic Recording](#basic-recording)
- [Cursor Indicator](#cursor-indicator)
- [How Recording Works](#how-recording-works)
- [Use Cases](#use-cases)
- [Best Practices](#best-practices)
- [Output Format](#output-format)
- [Limitations](#limitations)
## Basic Recording
Enable video recording when opening a session:
```bash
# Start with recording enabled; the open call's JSON response carries the session id
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://example.com",
"record_video": true
}' | jq -r '.session_id')
# Perform actions (the session id is quoted so an empty value cannot shift the arguments)
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "click", "ref": "@e1"
}'
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "fill", "ref": "@e2", "text": "test input"
}'
# Close to get the video
RESULT=$(infsh app run agent-browser --function close --session "$SESSION" --input '{}')
# Hand the JSON to jq verbatim; an unquoted echo would word-split and glob-expand it
VIDEO=$(jq -r '.video' <<<"$RESULT")
echo "Video file: $VIDEO"
```
## Cursor Indicator
For demos and documentation, show a visible cursor that follows mouse movements:
```bash
# Open a recorded session with the cursor overlay enabled and keep its session id
SESSION=$(
  infsh app run agent-browser \
    --function open \
    --session new \
    --input '{
"url": "https://example.com",
"record_video": true,
"show_cursor": true
}' \
    | jq -r '.session_id'
)
```
The cursor appears as a red dot that:
- Follows mouse movements in real-time
- Shows click feedback (shrinks on mousedown)
- Persists across page navigations
- Appears in both screenshots and video
This is especially useful for:
- Tutorial/documentation videos
- Debugging interaction issues
- Sharing recordings with non-technical stakeholders
## How Recording Works
1. **Start**: Pass `"record_video": true` in the `open` function
2. **Record**: All browser activity is captured throughout the session
3. **Stop**: Video is finalized when `close` is called
4. **Retrieve**: Video file is returned in the `close` response
The video captures:
- Page loads and navigations
- Element interactions (clicks, typing)
- Scrolling and animations
- Dynamic content changes
## Use Cases
### Debugging Failed Automation
```bash
#!/bin/bash
# Record automation for debugging: on a failed action, close the session
# right away so the video path can be printed before exiting.
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://app.example.com",
"record_video": true
}' | jq -r '.session_id')
# Run automation
RESULT=$(infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "click", "ref": "@e1"
}')
# Pass the JSON to jq verbatim; an unquoted echo would word-split and glob-expand it
SUCCESS=$(jq -r '.success' <<<"$RESULT")
if [ "$SUCCESS" != "true" ]; then
  echo "Action failed!"
  echo "Message: $(jq -r '.message' <<<"$RESULT")"
  # Get video for debugging
  CLOSE_RESULT=$(infsh app run agent-browser --function close --session "$SESSION" --input '{}')
  echo "Debug video: $(jq -r '.video' <<<"$CLOSE_RESULT")"
  exit 1
fi
infsh app run agent-browser --function close --session "$SESSION" --input '{}'
```
### Documentation Generation
Record workflows for user documentation:
```bash
#!/bin/bash
# Record how-to video: open a 1920x1080 recorded session, click through the
# steps with short waits in between, then close to obtain the video.
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://app.example.com/settings",
"record_video": true,
"width": 1920,
"height": 1080
}' | jq -r '.session_id')
# Add pauses for clarity
# (the session id is quoted throughout so an empty value cannot shift the arguments)
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "wait", "wait_ms": 1000
}'
# Step 1: Click settings
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "click", "ref": "@e5"
}'
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "wait", "wait_ms": 500
}'
# Step 2: Change setting
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "click", "ref": "@e10"
}'
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "wait", "wait_ms": 500
}'
# Step 3: Save
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "click", "ref": "@e15"
}'
infsh app run agent-browser --function interact --session "$SESSION" --input '{
"action": "wait", "wait_ms": 1000
}'
# Get the video
RESULT=$(infsh app run agent-browser --function close --session "$SESSION" --input '{}')
# Pass the JSON to jq verbatim; an unquoted echo would word-split and glob-expand it
echo "Documentation video: $(jq -r '.video' <<<"$RESULT")"
```
### Test Evidence for CI/CD
```bash
#!/bin/bash
# Record E2E test for CI artifacts
# Usage: <script> [test-name]
# Environment: TEST_URL (required), CI_ARTIFACTS_DIR (optional copy target).
# run_test_steps is not defined in this snippet; it is expected to be
# provided by the surrounding script before this point.
TEST_NAME="${1:-e2e-test}"
# Abort early instead of opening a session with an empty URL
: "${TEST_URL:?TEST_URL must be set}"
# Build the payload with jq so TEST_URL is JSON-escaped rather than spliced in raw
OPEN_INPUT=$(jq -cn --arg url "$TEST_URL" '{url: $url, record_video: true}')
SESSION=$(infsh app run agent-browser --function open --session new --input "$OPEN_INPUT" | jq -r '.session_id')
# Run test steps
run_test_steps "$SESSION"
TEST_RESULT=$?
# Always get video
CLOSE_RESULT=$(infsh app run agent-browser --function close --session "$SESSION" --input '{}')
# '// empty' leaves VIDEO blank when the response has no video, instead of the string "null"
VIDEO=$(jq -r '.video // empty' <<<"$CLOSE_RESULT")
# Save to artifacts
if [ -n "$CI_ARTIFACTS_DIR" ] && [ -n "$VIDEO" ]; then
  cp -- "$VIDEO" "$CI_ARTIFACTS_DIR/${TEST_NAME}.webm"
fi
# Report the test outcome, not the outcome of the artifact copy
exit $TEST_RESULT
```
### Monitoring and Auditing
```bash
#!/bin/bash
# Record automated task for audit trail; the recording is archived under a
# timestamp-based task id.
TASK_ID=$(date +%Y%m%d-%H%M%S)
SESSION=$(infsh app run agent-browser --function open --session new --input '{
"url": "https://admin.example.com",
"record_video": true
}' | jq -r '.session_id')
# Perform admin task
# ... automation steps ...
# Save recording
RESULT=$(infsh app run agent-browser --function close --session "$SESSION" --input '{}')
# '// empty' leaves VIDEO blank when the response has no video, instead of the string "null"
VIDEO=$(jq -r '.video // empty' <<<"$RESULT")
# Archive for audit; only report success when the file was actually moved
if [ -z "$VIDEO" ] || ! mv -- "$VIDEO" "/audit/recordings/${TASK_ID}.webm"; then
  echo "Failed to archive audit recording" >&2
  exit 1
fi
echo "Audit recording saved: ${TASK_ID}.webm"
```
## Best Practices
### 1. Add Strategic Pauses
Pauses make videos easier to follow:
```bash
# After significant actions, add a pause
'{"action": "click", "ref": "@e1"}'
'{"action": "wait", "wait_ms": 500}' # Let viewer see result
```
### 2. Use Larger Viewport for Documentation
```bash
'{"url": "...", "record_video": true, "width": 1920, "height": 1080}'
```
### 3. Handle Errors Gracefully
Always close the session, even on failure, so the video is still finalized and returned:
```bash
# Close the browser session when the script exits, whatever the exit path.
# Globals: SESSION (read) - session id; nothing is done when it is empty or unset.
cleanup() {
  if [ -n "$SESSION" ]; then
    # Quote the id so it is always passed as a single argument.
    # stderr is discarded so a failed close stays quiet during exit.
    infsh app run agent-browser --function close --session "$SESSION" --input '{}' 2>/dev/null
  fi
}
trap cleanup EXIT
```
### 4. Combine with Screenshots
Use screenshots for key frames, video for flow:
```bash
# Record overall flow
'{"record_video": true}'
# Capture key states (the session id is quoted so an empty value cannot shift the arguments)
infsh app run agent-browser --function screenshot --session "$SESSION" --input '{
"full_page": true
}'
```
### 5. Don't Record Sensitive Sessions
Avoid recording when handling credentials:
```bash
# Turn recording off whenever the run handles sensitive data
if [ "$CONTAINS_SENSITIVE_DATA" = "true" ]; then
  RECORD="false"
else
  RECORD="true"
fi
# Payload fragment: keep the expansion double-quoted between the single-quoted JSON parts
'{"url": "...", "record_video": '"$RECORD"'}'
```
## Output Format
- **Format**: WebM (VP8/VP9 codec)
- **Compatibility**: All modern browsers and video players
- **Quality**: Matches viewport size
- **Compression**: Efficient for screen content
## Limitations
1. **Session-level only** - Can't start/stop mid-session
2. **Memory usage** - Long sessions consume more memory
3. **File size** - Complex pages with animations produce larger files
4. **No audio** - Browser audio is not captured
5. **Returned on close** - Video only available after session ends