---
title: "Showcase: Real-World Examples"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Showcase: Real-World Examples}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Chunks are not evaluated by default; the diagram chunks below opt back in
# with eval=TRUE.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)
library(putior)
```

This showcase demonstrates putior diagrams at different scales, from simple workflows to complex multi-file pipelines.

## Small Workflows (3-5 nodes)

Perfect for single-purpose scripts or focused analysis tasks.

### Example: Simple ETL Pipeline

A basic extract-transform-load workflow:

```r
# 01_extract.R
# put label:"Extract Data", node_type:"input", output:"raw_data.csv"

# 02_transform.R
# put label:"Transform Data", input:"raw_data.csv", output:"clean_data.csv"

# 03_load.R
# put label:"Load to Database", node_type:"output", input:"clean_data.csv"
```

**Generated Diagram:**

```{r etl-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
etl_workflow <- data.frame(
  file_name = c("01_extract.R", "02_transform.R", "03_load.R"),
  id = c("extract", "transform", "load"),
  label = c("Extract Data", "Transform Data", "Load to Database"),
  node_type = c("input", "process", "output"),
  input = c(NA, "raw_data.csv", "clean_data.csv"),
  output = c("raw_data.csv", "clean_data.csv", NA),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
etl_mermaid <- put_diagram(etl_workflow, theme = "github", output = "raw")
cat("```mermaid\n")
cat(etl_mermaid)
cat("\n```\n")
```

### Example: Report Generation

A simple report generation workflow:

```r
# fetch_metrics.R
# put label:"Fetch Metrics", node_type:"input", output:"metrics.json"

# analyze.R
# put label:"Analyze Trends", input:"metrics.json", output:"analysis.rds"

# report.R
# put label:"Generate Report", node_type:"output", input:"analysis.rds", output:"report.html"
```

**Generated Diagram:**

```{r report-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
report_workflow <- data.frame(
  file_name = c("fetch_metrics.R", "analyze.R", "report.R"),
  id = c("fetch", "analyze", "report"),
  label = c("Fetch Metrics", "Analyze Trends", "Generate Report"),
  node_type = c("input", "process", "output"),
  input = c(NA, "metrics.json", "analysis.rds"),
  output = c("metrics.json", "analysis.rds", "report.html"),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
report_mermaid <- put_diagram(report_workflow, theme = "github", output = "raw")
cat("```mermaid\n")
cat(report_mermaid)
cat("\n```\n")
```

## Medium Workflows (10-15 nodes)

Suitable for typical data science projects with multiple processing stages.

### Example: Machine Learning Pipeline

A complete ML workflow from data collection to model deployment:

```r
# 01_collect_data.py
# put label:"Collect Raw Data", node_type:"input", output:"raw_data.csv"

# 02_clean_data.R
# put label:"Clean Data", input:"raw_data.csv", output:"clean_data.csv"

# 03_feature_eng.R
# put label:"Feature Engineering", input:"clean_data.csv", output:"features.csv"

# 04_split_data.R
# put label:"Train/Test Split", input:"features.csv", output:"train.csv, test.csv"

# 05_train_model.py
# put label:"Train Model", input:"train.csv", output:"model.pkl"

# 06_evaluate.py
# put label:"Evaluate Model", input:"model.pkl, test.csv", output:"metrics.json"

# 07_hyperparameter.py
# put label:"Hyperparameter Tuning", input:"train.csv", output:"best_params.json"

# 08_retrain.py
# put label:"Retrain with Best Params", input:"train.csv, best_params.json", output:"final_model.pkl"

# 09_validate.R
# put label:"Final Validation", input:"final_model.pkl, test.csv", output:"validation_report.html"

# 10_deploy.sh
# put label:"Deploy Model", node_type:"output", input:"final_model.pkl, validation_report.html"
```

**Generated Diagram:**

```{r ml-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
ml_workflow <- data.frame(
  file_name = c(
    "01_collect_data.py", "02_clean_data.R", "03_feature_eng.R",
    "04_split_data.R", "05_train_model.py", "06_evaluate.py",
    "07_hyperparameter.py", "08_retrain.py", "09_validate.R",
    "10_deploy.sh"
  ),
  id = c(
    "collect", "clean", "feature", "split", "train",
    "evaluate", "hyper", "retrain", "validate", "deploy"
  ),
  label = c(
    "Collect Raw Data", "Clean Data", "Feature Engineering",
    "Train/Test Split", "Train Model", "Evaluate Model",
    "Hyperparameter Tuning", "Retrain with Best Params",
    "Final Validation", "Deploy Model"
  ),
  # One input node, eight processing steps, one output node.
  node_type = c("input", rep("process", 8), "output"),
  input = c(
    NA, "raw_data.csv", "clean_data.csv", "features.csv", "train.csv",
    "model.pkl,test.csv", "train.csv", "train.csv,best_params.json",
    "final_model.pkl,test.csv", "final_model.pkl,validation_report.html"
  ),
  output = c(
    "raw_data.csv", "clean_data.csv", "features.csv", "train.csv,test.csv",
    "model.pkl", "metrics.json", "best_params.json", "final_model.pkl",
    "validation_report.html", NA
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
ml_mermaid <- put_diagram(ml_workflow, theme = "github", output = "raw")
cat("```mermaid\n")
cat(ml_mermaid)
cat("\n```\n")
```

### Example: Multi-Source Data Integration

Combining data from multiple sources:

```r
# sources/fetch_sales.R
# put label:"Fetch Sales API", node_type:"input", output:"sales_raw.json"

# sources/fetch_inventory.R
# put label:"Fetch Inventory DB", node_type:"input", output:"inventory_raw.csv"

# sources/fetch_customers.py
# put label:"Fetch Customer CRM", node_type:"input", output:"customers_raw.csv"

# transform/clean_sales.R
# put label:"Clean Sales", input:"sales_raw.json", output:"sales_clean.csv"

# transform/clean_inventory.R
# put label:"Clean Inventory", input:"inventory_raw.csv", output:"inventory_clean.csv"

# transform/clean_customers.R
# put label:"Clean Customers", input:"customers_raw.csv", output:"customers_clean.csv"

# integrate/merge_data.R
# put label:"Merge All Sources", input:"sales_clean.csv, inventory_clean.csv, customers_clean.csv", output:"integrated_data.csv"

# analyze/business_metrics.R
# put label:"Calculate Metrics", input:"integrated_data.csv", output:"metrics.rds"

# report/dashboard.R
# put label:"Generate Dashboard", node_type:"output", input:"metrics.rds", output:"dashboard.html"
```

**Generated Diagram:**

```{r integration-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
integration_workflow <- data.frame(
  file_name = c(
    "sources/fetch_sales.R", "sources/fetch_inventory.R",
    "sources/fetch_customers.py", "transform/clean_sales.R",
    "transform/clean_inventory.R", "transform/clean_customers.R",
    "integrate/merge_data.R", "analyze/business_metrics.R",
    "report/dashboard.R"
  ),
  id = c(
    "sales_api", "inv_db", "cust_crm", "clean_sales", "clean_inv",
    "clean_cust", "merge", "metrics", "dashboard"
  ),
  label = c(
    "Fetch Sales API", "Fetch Inventory DB", "Fetch Customer CRM",
    "Clean Sales", "Clean Inventory", "Clean Customers",
    "Merge All Sources", "Calculate Metrics", "Generate Dashboard"
  ),
  # Three source nodes, five processing steps, one output node.
  node_type = c(rep("input", 3), rep("process", 5), "output"),
  input = c(
    NA, NA, NA,
    "sales_raw.json", "inventory_raw.csv", "customers_raw.csv",
    "sales_clean.csv,inventory_clean.csv,customers_clean.csv",
    "integrated_data.csv", "metrics.rds"
  ),
  output = c(
    "sales_raw.json", "inventory_raw.csv", "customers_raw.csv",
    "sales_clean.csv", "inventory_clean.csv", "customers_clean.csv",
    "integrated_data.csv", "metrics.rds", "dashboard.html"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
integration_mermaid <- put_diagram(
  integration_workflow,
  theme = "github",
  output = "raw"
)
cat("```mermaid\n")
cat(integration_mermaid)
cat("\n```\n")
```

## Large Workflows (20+ nodes)

For enterprise-scale data pipelines and complex analysis systems.

### Example: Complete Analytics Platform

A full analytics platform with multiple parallel processing streams.

*Note: This complex subgraph diagram uses advanced Mermaid features (named subgraphs) that `put_diagram()` doesn't generate natively.
For enterprise workflows with complex groupings, you can combine putior-generated diagrams with hand-crafted Mermaid extensions.*

```mermaid
flowchart TD
    subgraph Data_Sources [Data Sources]
        web_logs([Web Logs])
        app_events([App Events])
        crm_data([CRM Data])
        finance_data([Finance Data])
    end

    subgraph Ingestion [Ingestion Layer]
        parse_logs[Parse Web Logs]
        parse_events[Parse App Events]
        extract_crm[Extract CRM]
        extract_finance[Extract Finance]
    end

    subgraph Transformation [Transformation Layer]
        clean_logs[Clean Logs]
        clean_events[Clean Events]
        clean_crm[Clean CRM]
        clean_finance[Clean Finance]
        enrich_logs[Enrich with Geo]
        enrich_events[Add Session Info]
        join_customer[Join Customer Data]
    end

    subgraph Analytics [Analytics Layer]
        user_behavior[User Behavior Analysis]
        conversion_funnel[Conversion Funnel]
        revenue_analysis[Revenue Analysis]
        cohort_analysis[Cohort Analysis]
        ab_testing[A/B Test Results]
    end

    subgraph Output [Output Layer]
        exec_dashboard[[Executive Dashboard]]
        marketing_report[[Marketing Report]]
        finance_report[[Finance Report]]
        data_warehouse[[Data Warehouse]]
    end

    web_logs --> parse_logs
    app_events --> parse_events
    crm_data --> extract_crm
    finance_data --> extract_finance

    parse_logs --> clean_logs
    parse_events --> clean_events
    extract_crm --> clean_crm
    extract_finance --> clean_finance

    clean_logs --> enrich_logs
    clean_events --> enrich_events
    clean_crm --> join_customer
    clean_finance --> revenue_analysis

    enrich_logs --> user_behavior
    enrich_events --> user_behavior
    enrich_events --> conversion_funnel
    join_customer --> cohort_analysis
    join_customer --> ab_testing

    user_behavior --> exec_dashboard
    conversion_funnel --> marketing_report
    revenue_analysis --> finance_report
    cohort_analysis --> exec_dashboard
    ab_testing --> marketing_report

    user_behavior --> data_warehouse
    revenue_analysis --> data_warehouse
    cohort_analysis --> data_warehouse

    classDef inputStyle fill:#dbeafe,stroke:#2563eb,stroke-width:2px,color:#1e40af
    classDef processStyle fill:#ede9fe,stroke:#7c3aed,stroke-width:2px,color:#5b21b6
    classDef outputStyle fill:#dcfce7,stroke:#16a34a,stroke-width:2px,color:#15803d

    class web_logs,app_events,crm_data,finance_data inputStyle
    class parse_logs,parse_events,extract_crm,extract_finance processStyle
    class clean_logs,clean_events,clean_crm,clean_finance processStyle
    class enrich_logs,enrich_events,join_customer processStyle
    class user_behavior,conversion_funnel,revenue_analysis,cohort_analysis,ab_testing processStyle
    class exec_dashboard,marketing_report,finance_report,data_warehouse outputStyle
```

## Multi-Language Workflows

putior excels at documenting polyglot data pipelines with **automatic comment syntax detection** for 30+ languages.

### Language-Specific Comment Syntax

| Comment Style | Languages | Example |
|---------------|-----------|---------|
| `# put` | R, Python, Shell, Julia, Ruby, YAML | `# put label:"Process Data"` |
| `-- put` | SQL, Lua, Haskell | `-- put label:"Query Database"` |
| `// put` | JavaScript, TypeScript, C, Go, Rust, Java | `// put label:"Transform JSON"` |
| `% put` | MATLAB, LaTeX | `% put label:"Compute Matrix"` |

### JavaScript/TypeScript Example

```javascript
// api_handler.js
// put label:"API Handler", node_type:"input", output:"api_response.json"
const response = await fetch('/api/data');
const data = await response.json();

// put label:"Data Validation", input:"api_response.json", output:"validated.json"
const validated = validateSchema(data);
```

### Go Example

```go
// processor.go
// put label:"Data Processor", input:"input.json", output:"output.json"
func ProcessData(input []byte) ([]byte, error) {
    // Processing logic
    return input, nil
}
```

### MATLAB Example

```matlab
% signal_analysis.m
% put label:"Signal Processing", node_type:"input", output:"signal_data.mat"
data = load('raw_signal.mat');

% put label:"FFT Analysis", input:"signal_data.mat", output:"frequency_spectrum.mat"
spectrum = fft(data.signal);
```

### Example: R + Python + SQL Pipeline

Each language uses
its native comment syntax:

```sql
-- extract.sql (SQL uses -- comments)
-- put label:"SQL Extract", node_type:"input", output:"raw_query_results.csv"
SELECT * FROM sales WHERE date > '2024-01-01';
```

```python
# transform.py (Python uses # comments)
# put label:"Python Transform", input:"raw_query_results.csv", output:"transformed.parquet"
import pandas as pd
df = pd.read_csv("raw_query_results.csv")
```

```r
# analyze.R (R uses # comments)
# put label:"R Statistical Analysis", input:"transformed.parquet", output:"stats.rds"
library(arrow)
data <- read_parquet("transformed.parquet")
```

```r
# visualize.R
# put label:"R Visualization", input:"stats.rds", output:"plots.pdf"

# report.py
# put label:"Python Report Gen", node_type:"output", input:"stats.rds, plots.pdf", output:"final_report.html"
```

**Generated Diagram:**

```{r multilang-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the SQL, Python and R annotations above.
multilang_workflow <- data.frame(
  file_name = c(
    "extract.sql", "transform.py", "analyze.R", "visualize.R", "report.py"
  ),
  id = c("sql", "python_transform", "r_stats", "r_viz", "python_report"),
  label = c(
    "SQL Extract", "Python Transform", "R Statistical Analysis",
    "R Visualization", "Python Report Gen"
  ),
  node_type = c("input", rep("process", 3), "output"),
  input = c(
    NA, "raw_query_results.csv", "transformed.parquet",
    "stats.rds", "stats.rds,plots.pdf"
  ),
  output = c(
    "raw_query_results.csv", "transformed.parquet", "stats.rds",
    "plots.pdf", "final_report.html"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
multilang_mermaid <- put_diagram(
  multilang_workflow,
  theme = "github",
  output = "raw"
)
cat("```mermaid\n")
cat(multilang_mermaid)
cat("\n```\n")
```

## Domain-Specific Examples

Real-world workflows from various data science domains.
### Bioinformatics Pipeline

A genomics analysis workflow processing FASTA sequences:

```r
# sequences/fetch_sequences.R
# put label:"Fetch FASTA Sequences", node_type:"input", output:"raw_sequences.fasta"

# sequences/quality_control.py
# put label:"Quality Control", input:"raw_sequences.fasta", output:"filtered_sequences.fasta, qc_report.html"

# alignment/run_blast.sh
# put label:"BLAST Alignment", input:"filtered_sequences.fasta", output:"blast_results.xml"

# alignment/parse_blast.R
# put label:"Parse BLAST Results", input:"blast_results.xml", output:"alignments.csv"

# analysis/differential_expression.R
# put label:"Differential Expression", input:"alignments.csv", output:"de_results.rds"

# analysis/pathway_analysis.R
# put label:"Pathway Enrichment", input:"de_results.rds", output:"pathways.csv"

# report/bioinformatics_report.R
# put label:"Generate Report", node_type:"output", input:"de_results.rds, pathways.csv, qc_report.html", output:"analysis_report.html"
```

**Generated Diagram:**

```{r bio-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
bio_workflow <- data.frame(
  file_name = c(
    "sequences/fetch_sequences.R", "sequences/quality_control.py",
    "alignment/run_blast.sh", "alignment/parse_blast.R",
    "analysis/differential_expression.R", "analysis/pathway_analysis.R",
    "report/bioinformatics_report.R"
  ),
  id = c("fetch", "qc", "blast", "parse", "de", "pathway", "report"),
  label = c(
    "Fetch FASTA Sequences", "Quality Control", "BLAST Alignment",
    "Parse BLAST Results", "Differential Expression", "Pathway Enrichment",
    "Generate Report"
  ),
  node_type = c(
    "input", "process", "process", "process", "process", "process", "output"
  ),
  input = c(
    NA, "raw_sequences.fasta", "filtered_sequences.fasta",
    "blast_results.xml", "alignments.csv", "de_results.rds",
    "de_results.rds,pathways.csv,qc_report.html"
  ),
  output = c(
    "raw_sequences.fasta", "filtered_sequences.fasta,qc_report.html",
    "blast_results.xml", "alignments.csv", "de_results.rds", "pathways.csv",
    "analysis_report.html"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(bio_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

### Financial Analysis Pipeline

Portfolio analysis and risk assessment workflow:

```r
# data/fetch_market_data.py
# put label:"Fetch Market Data", node_type:"input", output:"market_prices.parquet"

# data/fetch_holdings.R
# put label:"Load Portfolio Holdings", node_type:"input", output:"holdings.csv"

# analysis/calculate_returns.R
# put label:"Calculate Returns", input:"market_prices.parquet, holdings.csv", output:"returns.rds"

# analysis/risk_metrics.R
# put label:"Compute Risk Metrics", input:"returns.rds", output:"var_results.rds, sharpe_ratios.csv"

# analysis/attribution.py
# put label:"Performance Attribution", input:"returns.rds, holdings.csv", output:"attribution.json"

# optimization/portfolio_optimize.R
# put label:"Portfolio Optimization", input:"returns.rds, var_results.rds", output:"optimal_weights.csv"

# report/risk_dashboard.R
# put label:"Risk Dashboard", node_type:"output", input:"var_results.rds, sharpe_ratios.csv, attribution.json, optimal_weights.csv", output:"risk_report.html"
```

**Generated Diagram:**

```{r finance-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
finance_workflow <- data.frame(
  file_name = c(
    "data/fetch_market_data.py", "data/fetch_holdings.R",
    "analysis/calculate_returns.R", "analysis/risk_metrics.R",
    "analysis/attribution.py", "optimization/portfolio_optimize.R",
    "report/risk_dashboard.R"
  ),
  id = c(
    "market", "holdings", "returns", "risk", "attrib", "optimize", "dashboard"
  ),
  label = c(
    "Fetch Market Data", "Load Portfolio Holdings", "Calculate Returns",
    "Compute Risk Metrics", "Performance Attribution",
    "Portfolio Optimization", "Risk Dashboard"
  ),
  node_type = c(
    "input", "input", "process", "process", "process", "process", "output"
  ),
  input = c(
    NA, NA, "market_prices.parquet,holdings.csv", "returns.rds",
    "returns.rds,holdings.csv", "returns.rds,var_results.rds",
    "var_results.rds,sharpe_ratios.csv,attribution.json,optimal_weights.csv"
  ),
  output = c(
    "market_prices.parquet", "holdings.csv", "returns.rds",
    "var_results.rds,sharpe_ratios.csv", "attribution.json",
    "optimal_weights.csv", "risk_report.html"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(finance_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

### Web Scraping Pipeline

Data extraction from web sources:

```r
# scrape/fetch_urls.py
# put label:"Fetch URL List", node_type:"input", output:"target_urls.txt"

# scrape/scrape_pages.py
# put label:"Scrape Web Pages", input:"target_urls.txt", output:"raw_html.json"

# extract/parse_html.py
# put label:"Parse HTML Content", input:"raw_html.json", output:"extracted_text.json"

# extract/extract_entities.py
# put label:"Named Entity Recognition", input:"extracted_text.json", output:"entities.csv"

# transform/clean_data.R
# put label:"Clean and Normalize", input:"entities.csv", output:"clean_entities.csv"

# transform/deduplicate.R
# put label:"Remove Duplicates", input:"clean_entities.csv", output:"unique_entities.csv"

# load/save_to_db.py
# put label:"Load to Database", node_type:"output", input:"unique_entities.csv"
```

**Generated Diagram:**

```{r scraping-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table mirroring the annotations shown above.
scraping_workflow <- data.frame(
  file_name = c(
    "scrape/fetch_urls.py", "scrape/scrape_pages.py",
    "extract/parse_html.py", "extract/extract_entities.py",
    "transform/clean_data.R", "transform/deduplicate.R",
    "load/save_to_db.py"
  ),
  id = c("urls", "scrape", "parse", "ner", "clean", "dedup", "db"),
  label = c(
    "Fetch URL List", "Scrape Web Pages", "Parse HTML Content",
    "Named Entity Recognition", "Clean and Normalize", "Remove Duplicates",
    "Load to Database"
  ),
  node_type = c(
    "input", "process", "process", "process", "process", "process", "output"
  ),
  input = c(
    NA, "target_urls.txt", "raw_html.json", "extracted_text.json",
    "entities.csv", "clean_entities.csv", "unique_entities.csv"
  ),
  output = c(
    "target_urls.txt", "raw_html.json", "extracted_text.json",
    "entities.csv", "clean_entities.csv", "unique_entities.csv", NA
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(scraping_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

### Multi-Language ML Pipeline

A realistic ML workflow using R for data prep, Python for training, and R for reporting:

```r
# data/load_raw_data.R
# put label:"Load Raw Data - R", node_type:"input", output:"raw_data.rds"

# data/eda_analysis.R
# put label:"Exploratory Analysis - R", input:"raw_data.rds", output:"eda_report.html, data_summary.json"

# preprocessing/feature_engineering.R
# put label:"Feature Engineering - R", input:"raw_data.rds, data_summary.json", output:"features.parquet"

# preprocessing/split_data.R
# put label:"Train/Test Split - R", input:"features.parquet", output:"train.parquet, test.parquet"

# training/train_model.py
# put label:"Train XGBoost - Python", input:"train.parquet", output:"model.pkl, training_metrics.json"

# training/hyperparameter_search.py
# put label:"Hyperparameter Tuning - Python", input:"train.parquet", output:"best_params.json"

# training/final_model.py
# put label:"Final Model Training - Python", input:"train.parquet, best_params.json", output:"final_model.pkl"

# evaluation/model_evaluation.py
# put label:"Model Evaluation - Python", input:"final_model.pkl, test.parquet", output:"predictions.csv, eval_metrics.json"

# reporting/model_report.R
# put label:"Model Report - R", node_type:"output", input:"eval_metrics.json, training_metrics.json, eda_report.html", output:"final_report.html"

# deployment/export_model.py
# put label:"Export for Production - Python", node_type:"output", input:"final_model.pkl", output:"model_artifact.tar.gz"
```

**Generated Diagram:**

```{r multilang-ml-diagram, echo=FALSE, results='asis', eval=TRUE}
# Hand-built workflow table; the labels match the annotations shown above.
multilang_ml_workflow <- data.frame(
  file_name = c(
    "data/load_raw_data.R", "data/eda_analysis.R",
    "preprocessing/feature_engineering.R", "preprocessing/split_data.R",
    "training/train_model.py", "training/hyperparameter_search.py",
    "training/final_model.py", "evaluation/model_evaluation.py",
    "reporting/model_report.R", "deployment/export_model.py"
  ),
  id = c(
    "load", "eda", "features", "split", "train", "hyper", "final",
    "evaluate", "report", "deploy_export"
  ),
  label = c(
    "Load Raw Data - R", "Exploratory Analysis - R",
    "Feature Engineering - R", "Train/Test Split - R",
    "Train XGBoost - Python", "Hyperparameter Tuning - Python",
    "Final Model Training - Python", "Model Evaluation - Python",
    "Model Report - R", "Export for Production - Python"
  ),
  node_type = c(
    "input", "process", "process", "process", "process", "process",
    "process", "process", "output", "output"
  ),
  input = c(
    NA, "raw_data.rds", "raw_data.rds,data_summary.json",
    "features.parquet", "train.parquet", "train.parquet",
    "train.parquet,best_params.json", "final_model.pkl,test.parquet",
    "eval_metrics.json,training_metrics.json,eda_report.html",
    "final_model.pkl"
  ),
  output = c(
    "raw_data.rds", "eda_report.html,data_summary.json", "features.parquet",
    "train.parquet,test.parquet", "model.pkl,training_metrics.json",
    "best_params.json", "final_model.pkl",
    "predictions.csv,eval_metrics.json", "final_report.html",
    "model_artifact.tar.gz"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(multilang_ml_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

This example demonstrates:

- **R for data handling**: Loading, EDA, feature engineering, splitting
- **Python for ML**: XGBoost training, hyperparameter search, evaluation
- **R for reporting**: Combining results into a final report
- **Python for deployment**: Packaging model artifacts

---

## Improving Existing Annotations

Real-world codebases often have messy, incomplete annotations. Here's how to clean them up.
### Before: A Messy Starting Point

This ETL script has common problems:

```r
# etl_pipeline.R - typical messy annotations

# put id:"step1", output:"data"
# ^ Problem: Vague ID and output name
raw <- read.csv("sales_2024.csv")

# put id:"2", input:"data"
# ^ Problem: Inconsistent ID style (numeric), output missing
clean <- raw[complete.cases(raw), ]
clean$date <- as.Date(clean$date)

# (No annotation here - important step is undocumented!)
aggregated <- aggregate(amount ~ region, clean, sum)

# put label:"final step"
# ^ Problem: Missing ID, vague label, no input/output
write.csv(aggregated, "regional_sales.csv")
```

**Resulting diagram (disconnected, unclear):**

```{r before-diagram, echo=FALSE, results='asis', eval=TRUE}
# Workflow table for the messy script above; putior is already attached by
# the setup chunk, so it is not loaded again here.
before_workflow <- data.frame(
  file_name = rep("etl_pipeline.R", 3),
  id = c("step1", "2", "final_step_1"),
  label = c("step1", "2", "final step"),
  node_type = c("process", "process", "process"),
  input = c(NA, "data", NA),
  output = c("data", NA, NA),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(before_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

### Step-by-Step Improvement

**Step 1: Audit current state**

```{r eval=FALSE}
workflow <- put("etl_pipeline.R", validate = TRUE)
print(workflow)  # See what's detected
# Validation warnings will highlight issues
```

**Step 2: Use auto-detection to find gaps**

```{r eval=FALSE}
auto <- put_auto("etl_pipeline.R")
print(auto)  # Shows file I/O that wasn't annotated
```

**Step 3: Generate annotation templates**

```{r eval=FALSE}
put_generate("etl_pipeline.R")
# Outputs suggested annotations based on code patterns
```

**Step 4: Apply fixes with naming conventions**

| Convention | Example | Benefit |
|------------|---------|---------|
| Descriptive IDs | `extract_sales` not `step1` | Self-documenting |
| Verb + noun labels | "Load Sales Data" | Clear action |
| Full file names | `sales_2024.csv` not `data` | Traceable |
| Consistent style | `snake_case` IDs | Maintainable |

### After: Clean Annotations

```r
# etl_pipeline.R - improved annotations

# put id:"extract_sales", label:"Load Sales Data", \
#   node_type:"input", output:"sales_2024.csv"
raw <- read.csv("sales_2024.csv")

# put id:"clean_data", label:"Clean & Validate", \
#   input:"sales_2024.csv", output:"clean_sales.internal"
clean <- raw[complete.cases(raw), ]
clean$date <- as.Date(clean$date)

# put id:"aggregate_regions", label:"Aggregate by Region", \
#   input:"clean_sales.internal", output:"aggregated.internal"
aggregated <- aggregate(amount ~ region, clean, sum)

# put id:"export_results", label:"Export Regional Report", \
#   node_type:"output", input:"aggregated.internal", output:"regional_sales.csv"
write.csv(aggregated, "regional_sales.csv")
```

**Resulting diagram (connected, clear flow):**

```{r after-diagram, echo=FALSE, results='asis', eval=TRUE}
# Workflow table for the cleaned-up script above.
after_workflow <- data.frame(
  file_name = rep("etl_pipeline.R", 4),
  id = c("extract_sales", "clean_data", "aggregate_regions", "export_results"),
  label = c(
    "Load Sales Data", "Clean & Validate", "Aggregate by Region",
    "Export Regional Report"
  ),
  node_type = c("input", "process", "process", "output"),
  input = c(
    NA, "sales_2024.csv", "clean_sales.internal", "aggregated.internal"
  ),
  output = c(
    "sales_2024.csv", "clean_sales.internal", "aggregated.internal",
    "regional_sales.csv"
  ),
  stringsAsFactors = FALSE
)

# Write the raw Mermaid code inside a fenced block (chunk uses results='asis').
cat("```mermaid\n")
cat(put_diagram(after_workflow, theme = "github", output = "raw"))
cat("\n```\n")
```

### Key Improvements Made

| Before | After | Why |
|--------|-------|-----|
| `id:"step1"` | `id:"extract_sales"` | Descriptive, searchable |
| `output:"data"` | `output:"sales_2024.csv"` | Actual file name |
| Missing annotation | Added for aggregation step | Complete workflow |
| `label:"final step"` | `label:"Export Regional Report"` | Specific action |
| No node_type | Explicit input/process/output | Proper diagram shapes |

### Workflow for Legacy Code

For existing codebases without any annotations:

```{r eval=FALSE}
# 1. Start with auto-detection
auto_workflow <- put_auto("./legacy_code/", recursive = TRUE)
put_diagram(auto_workflow)  # Get initial picture

# 2. Generate annotation suggestions
put_generate("./legacy_code/", output = "clipboard")
# Paste into files and customize

# 3. Add manual annotations for key files
# Focus on main entry points first

# 4. Merge for complete picture
final <- put_merge("./legacy_code/", merge_strategy = "supplement", recursive = TRUE)
put_diagram(final, show_source_info = TRUE)
```

---

## Tips for Large Workflows

When working with complex workflows:

1. **Use meaningful IDs**: Choose IDs that reflect the step's purpose
2. **Group related files**: Organize scripts into subdirectories
3. **Use subgraphs**: Group related nodes with `show_source_info = TRUE, source_info_style = "subgraph"`
4. **Consider direction**: Use `direction = "LR"` for wide workflows, `direction = "TD"` for deep ones
5. **Show artifacts selectively**: Use `show_artifacts = TRUE` only when data lineage matters

```r
# For large workflows, consider:
put_diagram(workflow,
  direction = "LR",               # Left-to-right for wide pipelines
  show_source_info = TRUE,        # Show file names
  source_info_style = "subgraph", # Group by file
  theme = "minimal"               # Clean look for complex diagrams
)
```

## Try It Yourself

Run the built-in examples:

```{r eval=FALSE}
# Basic example
source(system.file("examples", "reprex.R", package = "putior"))

# Data science workflow
source(system.file("examples", "data-science-workflow.R", package = "putior"))

# Self-documentation (putior documents itself!)
source(system.file("examples", "self-documentation.R", package = "putior"))
```

---

## See Also

| Guide | Description |
|-------|-------------|
| [Quick Start](quick-start.html) | First diagram in 2 minutes |
| [Annotation Guide](annotation-guide.html) | Complete syntax reference |
| [Features Tour](features-tour.html) | Auto-detection, themes, logging |
| [API Reference](api-reference.html) | Function documentation |
| [Quick Reference](quick-reference.html) | At-a-glance reference card |
| [Troubleshooting](troubleshooting.html) | Common issues and solutions |
| [AI Integration](ai-integration.html) | MCP/ACP integration guide |