Skip to content

Steps API

Steps are the building blocks of SpiderChef recipes. This reference documents the core step classes and the built-in step types available in SpiderChef.

Core Step Classes

spiderchef.steps.base.BaseStep

Bases: ABC, BaseModel

Base step class that all steps inherit from.

Source code in spiderchef/steps/base.py
class BaseStep(ABC, BaseModel):
    """Base step class that all steps inherit from.

    Provides ``${name}`` / ``${env.NAME}`` placeholder substitution for every
    field of the step. Concrete steps implement :meth:`execute`.

    Attributes:
        name: Optional label for the step.
        step_registry: Class-level mapping of step names to step classes.
        use_previous_output: Whether the step should read the previous step's
            output (interpreted by each concrete step).
    """

    name: str = ""
    step_registry: ClassVar[dict[str, type["BaseStep"]]] = {}
    use_previous_output: bool = True

    def _replace(self, variables: dict[str, Any], value: str) -> str:
        """Return ``value`` with all placeholders substituted.

        Args:
            variables: Mapping of variable name to value; each ``${name}``
                occurrence is replaced by ``str(value)``.
            value: The string to process.

        Returns:
            The string with recipe variables and ``${env.NAME}`` placeholders
            replaced.

        Raises:
            ValueError: If an environment variable cannot be read, or if a
                placeholder matching ``RE_VAR`` is still present afterwards.
        """
        result = value
        for var_name, var_value in variables.items():
            placeholder = f"${{{var_name}}}"
            if placeholder in result:
                result = result.replace(placeholder, str(var_value))

        # Handle ${env.VAR_NAME}
        def env_replacer(match) -> str:
            env_var = match.group(1)
            try:
                return env.str(env_var)
            except Exception as exc:
                # Chain the cause so the underlying lookup failure stays visible.
                raise ValueError(
                    f"Environment variable '{env_var}' not found or invalid"
                ) from exc

        result = RE_ENV_VAR.sub(env_replacer, result)

        # Anything still looking like a placeholder was never defined.
        if "${" in result:
            if unreplaced_var := RE_VAR.findall(result):
                raise ValueError(
                    f"Variable '{unreplaced_var}' not found in Recipe.variables"
                )

        return result

    def _contains_variables(self, structure) -> bool:
        """Return True if any string inside ``structure`` contains ``${``.

        Strings are checked directly; dict values and list items are checked
        recursively. Any other type yields False.
        """
        if isinstance(structure, str):
            return "${" in structure
        elif isinstance(structure, dict):
            return any(self._contains_variables(v) for v in structure.values())
        elif isinstance(structure, list):
            return any(self._contains_variables(item) for item in structure)
        return False

    def _replace_in_structure(self, variables: dict[str, Any], structure: Any) -> Any:
        """Substitute placeholders in every string of a nested structure.

        Dicts and lists are rebuilt (keys are left untouched); values of any
        other non-string type are returned unchanged.
        """
        if isinstance(structure, str):
            structure = self._replace(variables, structure)
        elif isinstance(structure, dict):
            structure = {
                k: self._replace_in_structure(variables, v)
                for k, v in structure.items()
            }
        elif isinstance(structure, list):
            structure = [
                self._replace_in_structure(variables, item) for item in structure
            ]
        return structure

    def replace_variables(self, recipe: "Recipe") -> None:
        """Replace placeholders in all fields of the step, in place.

        Args:
            recipe: Recipe whose ``variables`` mapping supplies the values.
        """
        # Do not bail out when the recipe defines no variables: fields may
        # still carry ${env.NAME} placeholders that must be resolved.
        variables = recipe.variables or {}

        for name, field in self:
            if self._contains_variables(field):
                setattr(self, name, self._replace_in_structure(variables, field))

    @abstractmethod
    def execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Execute the step and return the result.

        The base implementation only resolves placeholders in the step's
        fields; subclasses call it before running their own logic.
        """
        self.replace_variables(recipe)

execute(recipe, previous_output=None) abstractmethod

Execute the step and return the result.

Source code in spiderchef/steps/base.py
@abstractmethod
def execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
    """Run the step against ``recipe`` and return its result.

    The shared base behavior is limited to resolving placeholders in the
    step's fields via ``replace_variables``; nothing is returned here.
    """
    self.replace_variables(recipe)

replace_variables(recipe)

Replace variables in all fields of the step.

Source code in spiderchef/steps/base.py
def replace_variables(self, recipe: "Recipe") -> None:
    """Replace placeholders in all fields of the step, in place.

    Args:
        recipe: Recipe whose ``variables`` mapping supplies the values.
    """
    # Do not bail out when the recipe defines no variables: fields may still
    # carry ${env.NAME} placeholders that must be resolved.
    variables = recipe.variables or {}

    for name, field in self:
        if self._contains_variables(field):
            setattr(self, name, self._replace_in_structure(variables, field))

spiderchef.steps.base.SyncStep

Bases: BaseStep

Base class for synchronous steps.

Source code in spiderchef/steps/base.py
class SyncStep(BaseStep):
    """Base class for steps whose logic runs synchronously.

    Subclasses implement ``_execute``; ``execute`` wraps it with the shared
    placeholder resolution from ``BaseStep``.
    """

    def execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        # Resolve placeholders in the step's fields before running the logic.
        super().execute(recipe)
        result = self._execute(recipe, previous_output)
        return result

    @abstractmethod
    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Run the step-specific logic and return its output."""

spiderchef.steps.base.AsyncStep

Bases: BaseStep

Base class for asynchronous steps.

Source code in spiderchef/steps/base.py
class AsyncStep(BaseStep):
    """Base class for steps whose logic is a coroutine.

    Subclasses implement the ``_execute`` coroutine; ``execute`` wraps it with
    the shared placeholder resolution from ``BaseStep`` and must be awaited.
    """

    async def execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        # Resolve placeholders in the step's fields before running the logic.
        super().execute(recipe)
        result = await self._execute(recipe, previous_output)
        return result

    @abstractmethod
    async def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Run the step-specific logic and return its output."""

spiderchef.steps.asynchronous.FetchStep

Bases: AsyncStep

Step to fetch data from an API.

Source code in spiderchef/steps/asynchronous.py
class FetchStep(AsyncStep):
    """Step that performs an HTTP GET or POST through the recipe's session.

    Attributes:
        assign_to_base: When true, the response text (and, for the ``"json"``
            return type, the parsed JSON) is stored on the recipe.
        return_type: Which representation of the response is returned.
        method: HTTP method to use.
        path: URL passed to the session.
        params: Query parameters.
        json_data: JSON body used for POST when ``data`` is empty.
        data: Body used for POST when non-empty.
        headers: Request headers.
        ok_status_codes: Status codes accepted by ``validate_response``.
        timeout: Timeout passed to the session call.
    """

    assign_to_base: bool = True
    return_type: Literal["text", "json", "response"] = "text"
    method: Literal["GET", "POST"] = "GET"
    path: str = ""
    params: dict[str, Any] = Field(default_factory=dict)
    json_data: dict[str, Any] = Field(default_factory=dict)
    data: dict[str, Any] | str = Field(default_factory=dict)
    headers: dict[str, Any] = Field(default_factory=dict)
    ok_status_codes: list[int] = Field(default_factory=lambda: [200])
    timeout: int = 5

    def validate_response(self, response: Response) -> None:
        """Raise ``ResponseIsNotOkError`` unless the status code is accepted."""
        if response.status_code in self.ok_status_codes:
            return
        raise ResponseIsNotOkError(response.status_code)

    async def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Send the request, validate it, and return the chosen representation."""
        session = await recipe.session

        # Arguments shared by every request variant.
        request_kwargs: dict[str, Any] = {
            "url": self.path,
            "params": self.params,
            "timeout": self.timeout,
            "headers": self.headers,
        }
        if self.method == "GET":
            response = await session.get(**request_kwargs)
        elif self.method == "POST":
            # A non-empty ``data`` body takes precedence over ``json_data``.
            if self.data:
                request_kwargs["data"] = self.data
            else:
                request_kwargs["json"] = self.json_data
            response = await session.post(**request_kwargs)

        self.validate_response(response)

        if self.assign_to_base:
            recipe.text_response = response.text
        if self.return_type == "json" and self.assign_to_base:
            recipe.json_response = response.json()

        if self.return_type == "json":
            return response.json()
        if self.return_type == "text":
            return response.text
        if self.return_type == "response":
            return response

spiderchef.steps.asynchronous.SleepStep

Bases: AsyncStep

Source code in spiderchef/steps/asynchronous.py
class SleepStep(AsyncStep):
    """Step that pauses the recipe and passes its input through unchanged."""

    # Delay handed to ``asyncio.sleep`` (seconds).
    timeout: int = 3

    async def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Sleep for ``timeout`` and return ``previous_output`` as-is."""
        delay = self.timeout
        await asyncio.sleep(delay)
        return previous_output

Extraction Steps

spiderchef.steps.extract.RegexStep

Bases: SyncStep

Step to regex a value from the recipe's text data.

Source code in spiderchef/steps/extract.py
class RegexStep(SyncStep):
    """Step to regex a value from the recipe's text data.

    Attributes:
        index: When set and within range of the matches, only that match is
            returned; otherwise the full list of matches is returned.
        expression: Regular expression handed to ``findall``.
    """

    index: int | None = None
    expression: str

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Return the matches of ``expression`` (or a single indexed match).

        The recipe's ``text_response`` is searched when ``use_previous_output``
        is false and a text response exists; otherwise ``previous_output`` is
        searched.
        """
        if not self.use_previous_output and recipe.text_response:
            text = recipe.text_response
        else:
            text = previous_output
        items = findall(self.expression, text)

        # Only index when the position really exists: ``index == len(items)``
        # (or a too-negative index) previously slipped through and raised
        # IndexError instead of falling back to the full list.
        if isinstance(self.index, int) and -len(items) <= self.index < len(items):
            return items[self.index]
        return items

spiderchef.steps.extract.RegexFirstStep

Bases: RegexStep

Source code in spiderchef/steps/extract.py
class RegexFirstStep(RegexStep):
    """RegexStep variant whose ``index`` defaults to 0 (the first match)."""

    # Same field as on RegexStep; only the default differs.
    index: int | None = 0

spiderchef.steps.extract.XpathStep

Bases: SyncStep

Step to xpath a value from the recipe's text data.

Source code in spiderchef/steps/extract.py
class XpathStep(SyncStep):
    """Step to xpath a value from the recipe's text data.

    Attributes:
        expression: XPath expression evaluated against the tree.
        return_type: ``"text"`` joins each node's text content; ``"html"``
            serializes each node.
        rebuild_tree: Force re-parsing ``recipe.text_response`` into the
            recipe's cached tree.
        index: When set and within range of the results, only that result is
            returned; otherwise the full list is returned.
    """

    expression: str
    return_type: Literal["text", "html"] = "html"
    rebuild_tree: bool = False
    index: int | None = None

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Evaluate the expression and return the results (or one of them)."""
        tree = None
        output = []

        # (Re)build the recipe-level tree when forced, or when this step needs
        # it and none is cached yet. ``is None`` is used on purpose: element
        # truthiness reflects "has children", not "exists".
        if self.rebuild_tree or (
            not self.use_previous_output and recipe._tree is None
        ):
            recipe._tree = fromstring(recipe.text_response)

        if self.use_previous_output and isinstance(previous_output, str):
            tree = fromstring(previous_output)
        elif not self.use_previous_output or self.rebuild_tree:
            # Reuse the cached tree; previously an already-cached tree was
            # ignored and the step silently returned no results.
            tree = recipe._tree

        if tree is not None:
            for node in tree.xpath(self.expression):
                if isinstance(node, str):
                    # Attribute/text() results are already plain strings.
                    output.append(node)
                elif self.return_type == "text":
                    output.append("".join(node.itertext()))
                else:
                    item = tostring(node, encoding="utf-8")
                    output.append(item.decode() if isinstance(item, bytes) else item)

        # Only index when the position exists; ``index == len(output)``
        # previously raised IndexError instead of returning the list.
        if isinstance(self.index, int) and -len(output) <= self.index < len(output):
            return output[self.index]
        return output

spiderchef.steps.extract.XpathFirstStep

Bases: XpathStep

Source code in spiderchef/steps/extract.py
class XpathFirstStep(XpathStep):
    """XpathStep variant whose ``index`` defaults to 0 (the first result)."""

    # Same field as on XpathStep; only the default differs.
    index: int | None = 0

spiderchef.steps.extract.GetStep

Bases: SyncStep

Step to get a value from the recipe's JSON data.

Source code in spiderchef/steps/extract.py
class GetStep(SyncStep):
    """Step to get a value from the recipe's JSON data.

    ``expression`` is a path handed to ``get``. When it contains ``[]`` the
    path is walked segment by segment, and list values are mapped over
    element-wise.
    """

    expression: str

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Look up ``expression`` in the previous output or the recipe's JSON."""
        source = previous_output if self.use_previous_output else recipe.json_response

        # Plain path: delegate the whole lookup to ``get``.
        if "[]" not in self.expression:
            return get(source, self.expression)

        # Path with list markers: strip them and descend one key at a time.
        current = source
        for key in self.expression.replace("[]", "").split("."):
            if isinstance(current, dict):
                current = get(current, key)
            elif isinstance(current, list):
                current = [get(entry, key) for entry in current]
        return current

spiderchef.steps.extract.ExtractItemsStep

Bases: AsyncStep

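Step to extract a list of items with a JSON, XPath, or regex expression and run the configured sub-steps on each item to build one output dict per item.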

Source code in spiderchef/steps/extract.py
class ExtractItemsStep(AsyncStep):
    """Step to regex a value from the recipe's text data."""

    expression: str
    expression_type: Literal["json", "xpath", "regex"] = "regex"
    items: dict[str, list[BaseStep | dict[str, Any]]]

    @field_validator("items", mode="before")
    def convert_step_dicts(
        cls, value: dict[str, list[dict]]
    ) -> dict[str, list[BaseStep]]:
        """Convert step dictionaries to Step instances before model creation."""
        converted_steps = {}
        for item, steps in value.items():
            converted_steps[item] = convert_steps(cls.step_registry, steps)

        return converted_steps

    async def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        outputs = []
        match self.expression_type:
            case "json":
                extraction_cls = GetStep
            case "xpath":
                extraction_cls = XpathStep
            case "regex":
                extraction_cls = RegexStep

        data_items = extraction_cls(
            expression=self.expression,
            use_previous_output=self.use_previous_output,
        ).execute(recipe, previous_output)
        if not data_items:
            return []
        for data_number, data in enumerate(data_items, start=1):
            output = {}
            log.info(f"  ➡️  {data_number}.  Extracting item ")
            for item_number, item in zip(string.ascii_lowercase, self.items.items()):
                item, steps = item
                item_output = data
                log.info(f"    ➡️  {item_number}.  Extracting {item}...")
                for step in steps:
                    step = cast(BaseStep, step)
                    if issubclass(type(step), AsyncStep):
                        item_output = await step.execute(recipe, item_output)
                    else:
                        item_output = step.execute(recipe, item_output)
                output[item] = item_output
            outputs.append(output)
        return outputs

convert_step_dicts(value)

Convert step dictionaries to Step instances before model creation.

Source code in spiderchef/steps/extract.py
@field_validator("items", mode="before")
def convert_step_dicts(
    cls, value: dict[str, list[dict]]
) -> dict[str, list[BaseStep]]:
    """Turn each key's list of step dicts into step instances.

    Runs before model creation; every list is passed to ``convert_steps``
    together with the class's step registry.
    """
    return {
        item: convert_steps(cls.step_registry, steps)
        for item, steps in value.items()
    }

Transformation Steps

spiderchef.steps.format.JoinBaseUrl

Bases: SyncStep

Joins a string (+optional path) with base_url

Source code in spiderchef/steps/format.py
class JoinBaseUrl(SyncStep):
    """Joins a string (+optional path) with base_url.

    Attributes:
        base_url: Base to join against; falls back to ``recipe.base_url`` when
            empty.
        path: Prefix placed before the value.
        suffix: Text added after the value.
    """

    base_url: str = ""
    path: str = ""
    suffix: str = ""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Join the previous output (or each element of a list) with the base."""
        root = self.base_url or recipe.base_url

        if not isinstance(previous_output, list):
            return urljoin(root, self.path + str(previous_output) + self.suffix)

        # NOTE(review): for lists the suffix is appended after the join, while
        # the scalar branch joins it as part of the path; confirm this is
        # intentional.
        return [
            urljoin(root, self.path + str(entry)) + self.suffix
            for entry in previous_output
        ]

spiderchef.steps.format.FromJson

Bases: SyncStep

Convert from json.

Source code in spiderchef/steps/format.py
class FromJson(SyncStep):
    """Parse a JSON string into Python data."""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Decode string input with ``loads``; pass anything else through."""
        if isinstance(previous_output, str):
            encoded = previous_output.encode()
            return loads(encoded)
        return previous_output

spiderchef.steps.format.ToInt

Bases: SyncStep

Convert to integer.

Source code in spiderchef/steps/format.py
class ToInt(SyncStep):
    """Convert to integer."""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Return ``previous_output`` as an ``int``.

        Decimal-looking input such as ``"12.7"`` or ``"1e3"`` is truncated via
        ``float``. Errors from that final conversion propagate unchanged.
        """
        # Try the exact conversion first: routing everything through float
        # silently corrupts integers above 2**53 (e.g. large numeric IDs).
        try:
            return int(previous_output)
        except (TypeError, ValueError):
            return int(float(previous_output))

spiderchef.steps.format.ToStr

Bases: SyncStep

Convert to string.

Source code in spiderchef/steps/format.py
class ToStr(SyncStep):
    """Convert the previous output to a string."""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Return ``str(previous_output)``."""
        text = str(previous_output)
        return text

spiderchef.steps.format.ToFloat

Bases: SyncStep

Convert to float.

Source code in spiderchef/steps/format.py
class ToFloat(SyncStep):
    """Convert the previous output to a float."""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Return ``float(previous_output)``."""
        number = float(previous_output)
        return number

spiderchef.steps.format.ToMoneyStep

Bases: SyncStep

Converts string to money format.

Handles different currency formats, decimal separators, and thousands separators.

Examples:

- '1.407' → 1407 (if period is thousands separator)
- '1,407.99' → 1407.99 (US format)
- '1.407,99' → 1407.99 (EU format)

Source code in spiderchef/steps/format.py
class ToMoneyStep(SyncStep):
    """Converts string to money format.

    Handles different currency formats, decimal separators, and thousands separators.
    Examples:
    - '1.407' → 1407 (if period is thousands separator)
    - '1,407.99' → 1407.99 (US format)
    - '1.407,99' → 1407.99 (EU format)

    Attributes:
        decimal_separator: Character separating the fractional part.
        thousands_separator: Character grouping thousands.
    """

    decimal_separator: str = ","
    thousands_separator: str = "."

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> float | None:
        """Return the amount as a float, or None if it cannot be parsed.

        ``None`` input is passed through as ``None``. A value that still fails
        ``float()`` after normalization is logged and yields ``None``.
        """
        if previous_output is None:
            return None

        raw = str(previous_output).strip()
        value = RemoveCurrencySymbols().execute(recipe, raw)

        decimal, thousands = self.decimal_separator, self.thousands_separator
        if (decimal, thousands) == (".", ","):
            # US format: 1,234.56 -> drop the grouping commas.
            value = value.replace(thousands, "")
        elif (decimal, thousands) == (",", "."):
            # EU format: 1.234,56 -> drop grouping dots, then use a dot decimal.
            value = value.replace(thousands, "").replace(decimal, ".")
        elif decimal == "." and "," not in value:
            # Ambiguous dots: a single dot followed by at most two digits is
            # read as a decimal point (1.40); anything else as grouping
            # (1.407 -> 1407).
            looks_decimal = value.count(".") == 1 and len(value.split(".")[-1]) <= 2
            if not looks_decimal:
                value = value.replace(".", "")

        try:
            return float(value)
        except ValueError:
            log.warning(f"Could not convert '{previous_output}' to money value")
            return None

spiderchef.steps.format.RemoveExtraWhitespace

Bases: SyncStep

Remove extra whitespace from strings.

Source code in spiderchef/steps/format.py
class RemoveExtraWhitespace(SyncStep):
    """Collapse whitespace in a string."""

    def _execute(self, recipe: "Recipe", previous_output: str = "") -> Any:
        """Replace every ``RE_WHITESPACE_CHARS`` match with a single space."""
        cleaned = RE_WHITESPACE_CHARS.sub(" ", previous_output)
        return cleaned

spiderchef.steps.format.RemoveHTMLTags

Bases: SyncStep

Removes HTML Tags from strings.

Source code in spiderchef/steps/format.py
class RemoveHTMLTags(SyncStep):
    """Strip HTML tags from a string."""

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Delete every ``RE_HTML_TAGS`` match from ``previous_output``."""
        stripped = RE_HTML_TAGS.sub("", previous_output)
        return stripped

Flow Control Steps

spiderchef.steps.conditional.CompareStep

Bases: SyncStep

Step to compare two values from the recipe's JSON data.

Source code in spiderchef/steps/conditional.py
class CompareStep(SyncStep):
    """Step to compare two values from the recipe's JSON data."""

    left_key: str | None = None
    right_key: str | None = None
    compare_to: Any = None
    condition: Literal[
        "gt",
        "lt",
        "eq",
        "gte",
        "lte",
    ]

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> bool:
        def get_value(json_response: dict[str, Any], key: str) -> float:
            if value := get(json_response, key):
                return value if isinstance(value, float | int) else len(value)
            raise ValueError(f"Could not get value for key: {key}")

        right_value = None
        if self.use_previous_output:
            json_response = previous_output
        else:
            json_response = recipe.json_response

        if self.left_key is not None:
            left_value = get_value(json_response, self.left_key)
        else:
            left_value = previous_output
        if self.right_key is not None:
            right_value = get_value(json_response, self.right_key)
        elif self.compare_to is not None:
            right_value = self.compare_to

        # Ensure both values are not None before comparison
        if right_value is None or left_value is None:  # pragma: no cover
            raise ValueError(
                f"Cannot compare: left value ({left_value}) or right value ({right_value}) is None"
            )

        match self.condition:
            case "eq":
                return left_value == right_value
            case "gt":
                return left_value > right_value
            case "lt":
                return left_value < right_value
            case "gte":
                return left_value >= right_value
            case "lte":
                return left_value <= right_value
            case _:  # pragma: no cover
                raise ValueError(f"Unknown condition: {self.condition}")

Data Management Steps

spiderchef.steps.base.SaveStep

Bases: SyncStep

Saves the previous_output into the variables to be used later on.

Source code in spiderchef/steps/base.py
class SaveStep(SyncStep):
    """Store the previous output in the recipe's variables for later steps.

    Attributes:
        variable: Key under which the value is stored in ``recipe.variables``.
    """

    variable: str

    def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Save ``previous_output`` and hand it on unchanged."""
        target = self.variable
        recipe.variables[target] = previous_output
        return previous_output

Error Handling Steps

spiderchef.steps.error.TryCatchStep

Bases: AsyncStep

Execute steps with error handling.

Source code in spiderchef/steps/error.py
class TryCatchStep(AsyncStep):
    """Execute steps with error handling.

    Attributes:
        try_steps: Steps run in order, each fed the previous step's result.
        catch_steps: Steps run (starting from the original input) when a try
            step raises.
        finally_steps: Steps always run last, fed the current result.
    """

    try_steps: list[BaseStep]
    catch_steps: list[BaseStep] = []
    finally_steps: list[BaseStep] = []

    @field_validator("try_steps", "catch_steps", "finally_steps", mode="before")
    def convert_steps(cls, value: list[dict[str, Any]]) -> list[BaseStep]:
        """Convert step dictionaries to Step instances before model creation."""
        return convert_steps(cls.step_registry, value)

    async def _run_step(self, step: BaseStep, recipe: "Recipe", value: Any) -> Any:
        """Execute one sub-step, awaiting it when it is asynchronous."""
        if isinstance(step, AsyncStep):
            return await step.execute(recipe, value)
        return step.execute(recipe, value)

    async def _execute(self, recipe: "Recipe", previous_output: Any = None) -> Any:
        """Run the try/catch/finally chains and return the final result.

        On failure the message and class name of the exception are stored in
        ``recipe.variables`` under ``"error"`` and ``"error_type"``.
        """
        result = previous_output

        try:
            for step in self.try_steps:
                # Async steps must be awaited; calling ``execute`` alone only
                # creates a coroutine, so the step never ran and its errors
                # were never caught.
                result = await self._run_step(step, recipe, result)
        except Exception as e:
            # Save error in variables for catch steps
            recipe.variables["error"] = str(e)
            recipe.variables["error_type"] = e.__class__.__name__
            log.error(f"{str(e)} caught!", error_type=e.__class__.__name__)
            # Execute catch steps
            catch_result = previous_output
            for step in self.catch_steps:
                catch_result = await self._run_step(step, recipe, catch_result)
            result = catch_result

        finally:
            # Execute finally steps
            finally_result = result
            for step in self.finally_steps:
                finally_result = await self._run_step(step, recipe, finally_result)
            result = finally_result

        return result

convert_steps(value)

Convert step dictionaries to Step instances before model creation.

Source code in spiderchef/steps/error.py
@field_validator("try_steps", "catch_steps", "finally_steps", mode="before")
def convert_steps(cls, value: list[dict[str, Any]]) -> list[BaseStep]:
    """Turn a list of step dicts into step instances before model creation."""
    registry = cls.step_registry
    return convert_steps(registry, value)