# Arxo — Policy and CI Gates

Use these policies to move from visibility to enforcement for fine-tuning architecture risk.

---
# Minimum-baseline policy: warning-level gates only.
# Use this as the starting rollout profile before promoting to error severity.
metrics:
  - id: finetuning_architecture
    policy:
      invariants:
        # Aggregate health score — broad early-warning signal.
        - metric: finetuning_architecture.overall_finetuning_health
          op: ">="
          value: 0.65
          severity: warning
          message: "Raise fine-tuning architecture health above minimum baseline"
        - metric: finetuning_architecture.reproducibility_score
          op: ">="
          value: 0.65
          severity: warning
          message: "Improve reproducibility controls"
        - metric: finetuning_architecture.data_integrity_score
          op: ">="
          value: 0.65
          severity: warning
          message: "Improve data integrity and eval maturity"
        - metric: finetuning_architecture.safety_governance_score
          op: ">="
          value: 0.65
          severity: warning
          message: "Improve artifact trust/privacy/governance controls"
---
# SFT-profile policy: per-category thresholds tightened above the minimum
# baseline, still at warning severity so CI does not hard-fail yet.
metrics:
  - id: finetuning_architecture
    config:
      profile: "sft"
      require_base_pinning: true
      require_eval_harness: true
      # Full determinism not yet enforced for SFT runs.
      require_full_determinism: false
      require_checkpoint_eval_lineage: true
      require_safe_serialization: true
      privacy_profile: "recordkeeping"
    policy:
      invariants:
        - metric: finetuning_architecture.overall_finetuning_health
          op: ">="
          value: 0.70
          severity: warning
          message: "Overall fine-tuning architecture health baseline not met"
        - metric: finetuning_architecture.base_model_versioning_score
          op: ">="
          value: 0.80
          severity: warning
          message: "Base model pinning baseline not met"
        - metric: finetuning_architecture.eval_absence_score
          op: ">="
          value: 0.75
          severity: warning
          message: "Eval readiness baseline not met"
        - metric: finetuning_architecture.checkpoint_eval_lineage_score
          op: ">="
          value: 0.80
          severity: warning
          message: "Checkpoint-eval lineage baseline not met"
        - metric: finetuning_architecture.artifact_trust_surface_score
          op: ">="
          value: 0.85
          severity: warning
          message: "Artifact trust/safe serialization baseline not met"
        - metric: finetuning_architecture.dataset_contamination_score
          op: ">="
          value: 0.75
          severity: warning
          message: "Dataset contamination controls are insufficient"
---
# RFT strict profile: all requirements enabled and every invariant promoted
# to error severity — violations fail the CI gate.
metrics:
  - id: finetuning_architecture
    config:
      profile: "rft"
      require_base_pinning: true
      require_eval_harness: true
      require_full_determinism: true
      require_preference_eval: true
      require_checkpoint_eval_lineage: true
      require_safe_serialization: true
      privacy_profile: "strict"
    policy:
      invariants:
        - metric: finetuning_architecture.overall_finetuning_health
          op: ">="
          value: 0.80
          severity: error
          message: "Overall fine-tuning architecture health baseline not met"
        - metric: finetuning_architecture.method_integrity_score
          op: ">="
          value: 0.80
          severity: error
          message: "Post-training method integrity baseline not met"
        - metric: finetuning_architecture.checkpoint_eval_lineage_score
          op: ">="
          value: 0.80
          severity: error
          message: "Checkpoint-eval lineage baseline not met"
        - metric: finetuning_architecture.artifact_trust_surface_score
          op: ">="
          value: 0.85
          severity: error
          message: "Artifact trust/safe serialization baseline not met"
        - metric: finetuning_architecture.privacy_recordkeeping_score
          op: ">="
          value: 0.80
          severity: error
          message: "Privacy and recordkeeping controls are insufficient"
        - metric: finetuning_architecture.dataset_contamination_score
          op: ">="
          value: 0.85
          severity: error
          message: "Dataset contamination controls are insufficient"
---
# No-regression policy: compares scores against the git baseline (origin/main)
# instead of fixed thresholds, failing CI on any drop from the baseline value.
metrics:
  - id: finetuning_architecture
    policy:
      baseline:
        mode: git
        ref: origin/main
      invariants:
        # `baseline: true` compares the metric against its value at `ref`
        # rather than a fixed threshold.
        - metric: finetuning_architecture.overall_finetuning_health
          op: ">="
          baseline: true
          severity: error
          message: "Overall fine-tuning architecture health regressed vs baseline"
        - metric: finetuning_architecture.eval_absence_score
          op: ">="
          baseline: true
          severity: error
          message: "Eval readiness regressed vs baseline"
        - metric: finetuning_architecture.checkpoint_eval_lineage_score
          op: ">="
          baseline: true
          severity: error
          message: "Checkpoint-eval lineage regressed vs baseline"
```sh
# Focused fine-tuning architecture run
arxo analyze --path . --metric finetuning_architecture
```
```sh
# AI preset run (includes finetuning_architecture)
arxo analyze --path . --preset ai --config arxo.yml --fail-fast
```
  1. Start with warning-level thresholds for 1-2 release cycles.
  2. Fix recurring low-score categories in central training modules first.
  3. Promote strict gates to error once score trends stabilize.
  4. Keep baseline no-regression checks enabled to prevent drift.